In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Load the dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    "/kaggle/input/taxol-drug-resistance-cell-lines-in-breast-cancer/Dataset.csv"
)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

## Data Distribution

The `logFC` feature is approximately normally distributed: its histogram is bell-shaped.

The `P.Value` feature has a positively skewed (right-skewed) histogram.

In [None]:
# Histogram of P.Value in 15 bins; label the x-axis directly on the returned axes.
df['P.Value'].plot.hist(bins=15).set_xlabel("P.Values")

In [None]:
# Histogram of logFC in 15 bins; label the x-axis directly on the returned axes.
df['logFC'].plot.hist(bins=15).set_xlabel("logFC")

In [None]:
# Pairwise view of the two numeric features, coloured by cell line.
# corner=True draws only the lower triangle of the grid.
sns.pairplot(
    data=df,
    vars=['P.Value', 'logFC'],
    hue='Cell Line',
    corner=True,
)

### No class imbalance, no need for SMOTE

In [None]:
# Number of rows per cell line (the classification target), to check class balance.
df['Cell Line'].value_counts()

In [None]:
# Replace the 'Cell Line' labels with integer codes; `le` keeps the fitted
# mapping (le.classes_) so codes can be translated back to the original labels.
le = LabelEncoder()
le.fit(df['Cell Line'])
df['Cell Line'] = le.transform(df['Cell Line'])

In [None]:
# Feature matrix: the columns at positions 1 and 2.
# NOTE(review): presumably these are 'logFC' and 'P.Value' -- confirm against the CSV layout.
x = df.iloc[:, 1:3].values
# Target vector: the last column.
# NOTE(review): presumably the encoded 'Cell Line' -- confirm it is the final column.
y = df.iloc[:, -1].values

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
def model_test(model, acc, title, show=True):
    """Score a fitted classifier on the held-out test split and report it.

    Reads the notebook-level ``x_test`` / ``y_test`` arrays.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict``.
    acc : int
        Number of decimal places kept in the reported accuracy percentage.
    title : str
        Label placed in front of the accuracy in the plot title or printed line.
    show : bool, default True
        If True, draw a confusion-matrix heatmap titled with the accuracy;
        otherwise only print the accuracy line.

    Returns
    -------
    None
    """
    y_pred = model.predict(x_test)
    score = round(accuracy_score(y_test, y_pred) * 100, acc)
    if show:
        # scikit-learn expects (y_true, y_pred): rows are the true classes and
        # columns the predicted ones. Swapping them transposes the matrix.
        cm = confusion_matrix(y_test, y_pred)
        # fmt='d' prints integer counts; the default '.2g' would render
        # counts >= 100 in scientific notation.
        ax = sns.heatmap(cm, annot=True, fmt='d')
        ax.set_xlabel("Predicted label")
        ax.set_ylabel("True label")
        plt.title("{}: {}%".format(title, score))
        plt.show()
    else:
        print("{}: {}%".format(title, score))

In [None]:
def model_train(model, name):
    """Fit ``model`` on the training split and print its test accuracy.

    Uses the notebook-level ``x_train`` / ``y_train`` arrays for fitting, then
    delegates to ``model_test`` in print-only mode with two decimal places.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` (and ``predict``, used by ``model_test``).
    name : str
        Label shown in front of the printed accuracy.

    Returns
    -------
    The same ``model`` object, now fitted.
    """
    model.fit(x_train, y_train)
    model_test(model, acc=2, title=name, show=False)
    return model

In [None]:
# Candidate classifiers to compare. The tree-based and boosting estimators are
# randomized, so they get a fixed random_state (the same seed as the
# train/test split) to keep accuracies reproducible across Restart & Run All.
rfc = RandomForestClassifier(n_estimators=100, max_depth=250, random_state=42)
svc = SVC(C=1, kernel='rbf')
gnb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=10)
dtc = DecisionTreeClassifier(max_depth=250, random_state=42)
abc = AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=42)
# Display names, index-aligned with `models` below.
names = ['Random Forest Classifier', 'Support Vector Machine',
        'Gaussian Naive Bayes', 'KNeighborsClassifier',
        'Decision Tree Classifier', 'Ada Boost Classifier']
models = [rfc, svc, gnb, knn, dtc, abc]

In [None]:
# Fit every candidate in turn (each call prints its test accuracy) and keep
# the fitted estimators in the same order as `models`.
mls = [model_train(estimator, label) for label, estimator in zip(names, models)]

# TOP-3 Classification models:

* ## Random Forest Classifier
* ## DecisionTreeClassifier/KNeighborsClassifier
* ## AdaBoostClassifier

In [None]:
# Confusion-matrix heatmap for the random forest. Pass the readable name as
# the title rather than the estimator object, whose repr would end up in the plot title.
model_test(mls[0], 4, names[0])

In [None]:
# Confusion-matrix heatmap for k-nearest neighbours. Pass the readable name as
# the title rather than the estimator object, whose repr would end up in the plot title.
model_test(mls[3], 4, names[3])

In [None]:
# Confusion-matrix heatmap for the decision tree. Pass the readable name as
# the title rather than the estimator object, whose repr would end up in the plot title.
model_test(mls[4], 4, names[4])

In [None]:
# Confusion-matrix heatmap for AdaBoost (last entry). Pass the readable name as
# the title rather than the estimator object, whose repr would end up in the plot title.
model_test(mls[-1], 4, names[-1])