In [None]:
# Stack two SMILES datasets row-wise. Rows that share a SMILES string are kept
# as separate entries, because the same string with different measured values
# may correspond to different enantiomers.

import pandas as pd


def stack_datasets(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    drop_exact_duplicates: bool = False,
) -> pd.DataFrame:
    """Concatenate two datasets row-wise with a fresh 0..n-1 index.

    Columns present in only one input are filled with NaN for rows coming from
    the other input. Row-wise concatenation never creates ``*_x`` / ``*_y``
    columns (those are produced by ``pd.merge``), so no column coalescing is
    needed afterwards.

    Args:
        df1: First dataset.
        df2: Second dataset; its rows are appended after those of ``df1``.
        drop_exact_duplicates: When True, rows that are identical in every
            column are collapsed to one, while rows that share a SMILES but
            differ in any value are all kept. Defaults to False, which keeps
            every row.

    Returns:
        A new DataFrame; neither input is modified.
    """
    stacked = pd.concat([df1, df2], ignore_index=True)
    if drop_exact_duplicates:
        stacked = stacked.drop_duplicates().reset_index(drop=True)
    return stacked


# The guard keeps the file I/O below from running when the helper above is
# imported (for example by a test). In a notebook kernel __name__ is
# "__main__", so the cell still executes top to bottom.
if __name__ == "__main__":
    # Load datasets
    df1 = pd.read_csv("/content/dataset1.csv")
    df2 = pd.read_csv("/content/dataset2.csv")

    # Print dataset details before merging
    print("Before Merging:")
    print(f"Rows in df1: {len(df1)}, Columns in df1: {df1.shape[1]}")
    print(f"Rows in df2: {len(df2)}, Columns in df2: {df2.shape[1]}")
    print(f"Unique SMILES in df1: {df1['smiles'].nunique()}")
    print(f"Unique SMILES in df2: {df2['smiles'].nunique()}")

    # Check missing values per column before merging
    print("\nMissing values per column in df1:")
    print(df1.isnull().sum())

    print("\nMissing values per column in df2:")
    print(df2.isnull().sum())

    # Merge datasets while preserving duplicate SMILES entries.
    # NOTE(review): exact duplicate rows are also preserved here; pass
    # drop_exact_duplicates=True if those should be collapsed - confirm intent.
    df_merged = stack_datasets(df1, df2)

    # Print dataset details after merging
    print("\nAfter Merging:")
    print(f"Rows in Merged Dataset: {len(df_merged)}, Columns in Merged Dataset: {df_merged.shape[1]}")
    print(f"Unique SMILES in Merged Dataset: {df_merged['smiles'].nunique()}")

    # Check missing values per column after merging
    print("\nMissing values per column after merging:")
    print(df_merged.isnull().sum())

    # Save the final dataset
    df_merged.to_csv("OUTPUT.csv", index=False)

    print("\nFinal dataset saved successfully!")


In [None]:
# Remove empty rows: rows that carry no data in any column other than 'smiles'.
import pandas as pd


def drop_rows_without_values(df: pd.DataFrame, key_column: str = "smiles") -> pd.DataFrame:
    """Drop rows whose every column except ``key_column`` is NaN.

    The value columns are selected by name rather than by position, so the
    result does not depend on where ``key_column`` sits in the file.

    Args:
        df: Dataset to clean; it is not modified.
        key_column: Identifier column that is ignored when deciding whether a
            row is empty.

    Returns:
        A new DataFrame with the empty rows removed. The original index labels
        and column order are kept.
    """
    value_columns = [name for name in df.columns if name != key_column]
    return df.dropna(subset=value_columns, how="all")


# The guard keeps the file I/O below from running when the helper above is
# imported (for example by a test). In a notebook kernel __name__ is
# "__main__", so the cell still executes top to bottom.
if __name__ == "__main__":
    # Read the CSV file
    df = pd.read_csv('DATASET.csv')

    # Remove rows where all columns (except 'smiles') have NaN values
    df_cleaned = drop_rows_without_values(df)

    # Save the cleaned dataset to a new CSV file
    df_cleaned.to_csv('OUTPUT.csv', index=False)

    print("Rows with no values have been removed.")

In [None]:
# Outer-merge two datasets on 'smiles' so that a SMILES present in both files
# ends up in a single row, with overlapping columns coalesced into one.

import pandas as pd


def merge_on_smiles(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Outer-join two datasets on ``smiles`` and coalesce shared columns.

    For every non-key column present in both inputs, ``pd.merge`` emits a
    ``<col>_x`` / ``<col>_y`` pair. Each pair is folded back into a single
    ``<col>`` that takes the df1 value when it is not NaN and the df2 value
    otherwise. Coalesced columns are appended after the remaining columns.

    NOTE(review): SMILES repeated *within* one input are not collapsed; an
    outer merge pairs every matching row from df1 with every matching row from
    df2. De-duplicate the inputs first if one row per SMILES is required.

    Args:
        df1: Left dataset; its values win when both sides have a value.
        df2: Right dataset.

    Returns:
        A new merged DataFrame; neither input is modified.

    Raises:
        KeyError: If either input has no ``smiles`` column (raised by pandas).
    """
    merged = pd.merge(df1, df2, on="smiles", how="outer")

    shared = [c for c in df1.columns if c in df2.columns and c != "smiles"]
    for col in shared:
        col_x, col_y = f"{col}_x", f"{col}_y"
        if col_x in merged.columns and col_y in merged.columns:
            merged[col] = merged[col_x].combine_first(merged[col_y])
            merged = merged.drop(columns=[col_x, col_y])
    return merged


def columns_lost_in_merge(df1: pd.DataFrame, df2: pd.DataFrame, merged: pd.DataFrame) -> set:
    """Return the input column names that are absent from ``merged``.

    The union is parenthesised on purpose: ``-`` binds tighter than ``|``, so
    ``a | b - c`` would evaluate as ``a | (b - c)`` and report every df1 column
    as missing.
    """
    return (set(df1.columns) | set(df2.columns)) - set(merged.columns)


# The guard keeps the file I/O below from running when the helpers above are
# imported (for example by a test). In a notebook kernel __name__ is
# "__main__", so the cell still executes top to bottom.
if __name__ == "__main__":
    # Load datasets
    df1 = pd.read_csv("/content/dataset1.csv")
    df2 = pd.read_csv("/content/DATASET2.csv")

    # Print dataset details before merging
    print("Before Merging:")
    print(f"Rows in df1: {len(df1)}, Columns in df1: {df1.shape[1]}")
    print(f"Rows in df2: {len(df2)}, Columns in df2: {df2.shape[1]}")
    print(f"Unique SMILES in df1: {df1['smiles'].nunique()}")
    print(f"Unique SMILES in df2: {df2['smiles'].nunique()}")

    # Check missing values per column before merging
    print("\nMissing values per column in df1:")
    print(df1.isnull().sum())

    print("\nMissing values per column in df2:")
    print(df2.isnull().sum())

    # Merge datasets and fold duplicated columns back together
    df_merged = merge_on_smiles(df1, df2)

    # Print dataset details after merging
    print("\nAfter Merging:")
    print(f"Rows in Merged Dataset: {len(df_merged)}, Columns in Merged Dataset: {df_merged.shape[1]}")
    print(f"Unique SMILES in Merged Dataset: {df_merged['smiles'].nunique()}")

    # Check if any columns were lost or added
    print("\nColumns added after merging:", set(df_merged.columns) - set(df1.columns) - set(df2.columns))
    print("Columns missing after merging:", columns_lost_in_merge(df1, df2, df_merged))

    # Check missing values per column after merging
    print("\nMissing values per column after merging:")
    print(df_merged.isnull().sum())

    # Save the final dataset
    df_merged.to_csv("output.csv", index=False)

    print("\nFinal dataset saved successfully!")


In [None]:
# Report, for every column, how many cells are filled in and how many are empty

import pandas as pd

# Read the dataset to inspect
file_path = "/content/output.csv"  # Replace with your actual file path
df = pd.read_csv(file_path)

# Per-column tallies
total_rows = df.shape[0]            # number of rows in the file
entered_values = df.count()         # non-null cells in each column
missing_values = df.isna().sum()    # null cells in each column

# One row per column of the dataset, labelled by column name; the two tallies
# are attached by index alignment.
summary = pd.DataFrame({"Column Name": df.columns}, index=df.columns).assign(
    **{
        "Entered Values": entered_values,
        "Missing Values": missing_values,
    }
)

# Show the table
print(summary)



       Column Name  Entered Values  Missing Values
smiles      smiles            1513               0
pIC50        pIC50            1513               0
