In [105]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [106]:
%matplotlib inline
sns.set_style('whitegrid')

In [107]:
data=pd.read_csv("problem_1/train.csv")

In [108]:
data.describe()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
count,466.0,466.0,466.0,466.0,466.0,466.0,466.0,466.0,463.0,466.0
mean,45.321888,3.406438,1.503004,294.877682,82.633047,114.982833,6.491845,3.149356,0.954104,1.285408
std,15.697936,6.510448,2.8237,252.718859,192.8019,312.869109,1.096455,0.805285,0.333646,0.452094
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,176.0,23.0,25.0,5.8,2.6,0.7,1.0
50%,46.0,1.0,0.3,210.0,36.0,42.0,6.6,3.1,0.96,1.0
75%,58.0,2.575,1.3,298.0,60.0,88.0,7.2,3.8,1.1,2.0
max,85.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


# check null values in data

In [109]:
data.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    3
Dataset                       0
dtype: int64

# filling null values with mean values

In [110]:
data.fillna(data.mean(), inplace=True)

In [111]:
data.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

# analysing data by dataset 1 and dataset 2

In [112]:
pd.set_option('display.max_rows',500)

In [113]:
data.groupby('Dataset').describe().transpose()

Unnamed: 0,Dataset,1,2
Age,count,333.0,133.0
Age,mean,46.675676,41.932331
Age,std,15.11731,16.646137
Age,min,8.0,4.0
Age,25%,34.0,28.0
Age,50%,47.0,41.0
Age,75%,58.0,56.0
Age,max,78.0,85.0
Total_Bilirubin,count,333.0,133.0
Total_Bilirubin,mean,4.298198,1.173684


# as the difference between the mean values of  Alkaline_Phosphotase ,Alamine_Aminotransferase ,Aspartate_Aminotransferase is big between in dataset1 and dataset2 so a new column of addition of these 3 columns 

In [114]:
data['sum3']=data.Alkaline_Phosphotase+data.Alamine_Aminotransferase+data.Aspartate_Aminotransferase

# Applying DecisionTreeClassifier

In [140]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [147]:
X=data[['Alkaline_Phosphotase','Alamine_Aminotransferase','Aspartate_Aminotransferase','sum3','Direct_Bilirubin']]
y=data['Dataset']
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1)

In [148]:
clf = DecisionTreeClassifier(random_state=1)

clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1, splitter='best')

In [119]:
y_pred=clf.predict(X_test)

In [120]:
x= y_test==y_pred

In [121]:
x.value_counts()/len(x)

True     0.632479
False    0.367521
Name: Dataset, dtype: float64

# Applying RandomForestClassifier

In [122]:
from sklearn.ensemble import RandomForestClassifier


In [123]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

In [124]:
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [125]:
y_pred=clf.predict(X_test)

In [126]:
x= y_test==y_pred

In [127]:
x.value_counts()/len(x)

True     0.74359
False    0.25641
Name: Dataset, dtype: float64

# Scaling the data

In [152]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Applying GradientBoostingClassifier

In [129]:
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier

In [130]:
seed = 2
num_trees = 100
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results.mean())

0.6913506012950971


# Applying LogisticRegression

In [164]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(C=0.0001,max_iter=100000,solver='saga',warm_start=True,penalty='l2',random_state=1)

In [165]:
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
x= y_test==y_pred
x.value_counts()/len(x)

True     0.769231
False    0.230769
Name: Dataset, dtype: float64

In [131]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
x= y_test==y_pred
x.value_counts()/len(x)

True     0.74359
False    0.25641
Name: Dataset, dtype: float64

# Applying Bagging and  DecisionTreeClassifier

In [93]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [167]:
seed = 5
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results.mean())

0.706290471785384


# accuracy with LogisticRegression is best so now train whole data and test on test file


In [168]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(C=0.0001,max_iter=100000,solver='saga',warm_start=True,penalty='l2',random_state=1)
clf.fit(X,y)

LogisticRegression(C=0.0001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='saga', tol=0.0001, verbose=0,
                   warm_start=True)