# 🔬 Signature Drift in Cancer Cells
A bioinformatics project that uses PCA and a two-sample t-test to measure how gene expression in cancer cells differs from gene expression in normal cells.

In [None]:
# 📦 Step 1: Install Required Libraries
# Installs the third-party packages that the rest of the notebook imports
# (scikit-learn is the package that provides the `sklearn` module).
# The leading `!` is notebook shell-escape syntax: this line runs `pip` as a
# shell command and is not valid in a plain Python script.
!pip install pandas numpy matplotlib seaborn scikit-learn scipy

In [None]:
# 📂 Step 2: Upload Your Dataset
# Uses the Google Colab `files` helper, so this cell only works inside Colab.
# The return value of files.upload() is kept in `uploaded`; no other cell in
# this notebook reads it.
# NOTE(review): Step 4 opens 'lung_cancer.csv' by name, so the uploaded file
# presumably has to carry exactly that name — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# 📊 Step 3: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from scipy.stats import ttest_ind

In [None]:
# 📈 Step 4: Load the Dataset
# Reads 'lung_cancer.csv' (relative path, i.e. the current working directory)
# into the notebook-wide DataFrame `df`.
# Later cells expect `df` to contain a 'Class' label column whose values
# include 'Normal' and 'Cancer'; every other column is passed to PCA as a
# feature.
df = pd.read_csv('lung_cancer.csv')
# Preview the first rows as a quick sanity check of the parsed columns.
df.head()

In [None]:
# 🧬 Step 5: Perform PCA
# Project every sample onto the first two principal components and derive a
# per-sample "drift score" from those coordinates.
#
# Inputs : `df` from Step 4. It must contain a 'Class' label column; every
#          other column is handed to PCA as a feature.
# Outputs: `pca_df` with columns PC1, PC2, Class and DriftScore, also written
#          to drift_output.csv (without the index).
features = df.drop('Class', axis=1)
labels = df['Class']

# Pin the seed. For large inputs scikit-learn's default solver policy may
# select the randomized SVD, in which case an unseeded PCA can produce
# slightly different components (and therefore different drift scores and
# t-test results) on every run. The seed has no effect when an exact solver
# is chosen.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(features)

pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])
# `.values` assigns the labels by position; pca_df has a fresh 0..n-1 index,
# so aligning on df's original index could mismatch or introduce NaN.
pca_df['Class'] = labels.values

# DriftScore is the Euclidean distance from the origin of the PC1/PC2 plane.
# NOTE(review): PCA centers the features before projecting, so the origin is
# the mean of ALL samples (both classes pooled), not the centroid of the
# 'Normal' samples — confirm that this is the intended reference point.
pca_df['DriftScore'] = np.sqrt(pca_df['PC1']**2 + pca_df['PC2']**2)
pca_df.to_csv("drift_output.csv", index=False)
pca_df.head()

In [None]:
# 📊 Step 6: Visualize PCA
# Scatter the samples in the PC1/PC2 plane, one colour per 'Class' value,
# then save the figure as pca_plot.png and display it.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', hue='Class', s=100, data=pca_df, ax=ax)
ax.set_title('PCA of Gene Expression')
# Save before showing so the file is written from the fully drawn figure.
fig.savefig("pca_plot.png")
plt.show()

In [None]:
# 📉 Step 7: Boxplot of Drift Score
# One box per 'Class' value showing the distribution of DriftScore; the
# figure is saved as drift_score_boxplot.png and then displayed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='Class', y='DriftScore', data=pca_df, ax=ax)
ax.set_title('Drift Score by Class')
# Save before showing so the file is written from the fully drawn figure.
fig.savefig("drift_score_boxplot.png")
plt.show()

In [None]:
# 🧪 Step 8: T-Test
# Two-sample t-test on DriftScore: 'Cancer' samples versus 'Normal' samples.
# `cancer` is passed first, so a positive t-statistic means the cancer group
# has the larger mean drift score.
normal = pca_df.loc[pca_df['Class'] == 'Normal', 'DriftScore']
cancer = pca_df.loc[pca_df['Class'] == 'Cancer', 'DriftScore']

# Fail loudly when a label is missing. If the dataset encodes classes
# differently (e.g. 0/1 or different casing) both selections are empty and
# the test would otherwise just print NaN statistics.
if normal.empty or cancer.empty:
    raise ValueError(
        "Expected the 'Class' column to contain both 'Normal' and 'Cancer'; "
        f"found {pca_df['Class'].unique().tolist()}"
    )

# Called with SciPy's defaults (two-sided test assuming equal variances).
# NOTE(review): if the two groups have clearly different spread or size,
# Welch's variant (equal_var=False) may be the more appropriate choice.
t_stat, p_val = ttest_ind(cancer, normal)
print(f"T-statistic: {t_stat:.3f}, P-value: {p_val:.5f}")