In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
#Loading the data
df = pd.read_csv('assignment_train.csv')
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,target
0,0,0,14.0,0,13,0,0,0.010945,2472.34595,-0.115705,0.862304,56.574546,0.666613,0
1,3,0,14.0,0,16,8,0,-0.236723,1758.632335,-0.456224,0.882355,7.042942,-0.845667,1
2,0,0,0.0,0,10,1,0,-0.274587,9.101218,-0.378354,0.987417,-12.720269,-0.32201,1
3,0,0,13.0,0,12,1,0,-0.399222,1051.140336,-1.022416,0.906931,129.916224,0.826691,0
4,0,0,,0,0,7,0,-0.086009,-254.164304,1.098035,1.215128,37.566795,0.700999,1


In [4]:
#Cleaning data
#Displaying number of null values in the above dataframe
print(df.isna().sum())

col_0       0
col_1       0
col_2     389
col_3       0
col_4       0
col_5       0
col_6       0
col_7       0
col_8       0
col_9      66
col_10    101
col_11    163
col_12    163
target      0
dtype: int64


In [5]:
#filling missing values by taking mode for col_2 as it is a categorical feature
df['col_2'].fillna(df['col_2'].mode()[0], inplace=True)

#filling missing values by taking interpolation for col_9 and col_12 as it is a continuous numerical variable and most values lie close to each other
df[['col_9','col_12']] = df[['col_9','col_12']].interpolate(method='linear')

#filling missing values by taking median for col_10 and col_11
df[['col_10','col_11']]= df[['col_10','col_11']].fillna(df[['col_10','col_11']].mean())
df.head()
print(df.isna().sum())

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,target
0,0,0,14.0,0,13,0,0,0.010945,2472.34595,-0.115705,0.862304,56.574546,0.666613,0
1,3,0,14.0,0,16,8,0,-0.236723,1758.632335,-0.456224,0.882355,7.042942,-0.845667,1
2,0,0,0.0,0,10,1,0,-0.274587,9.101218,-0.378354,0.987417,-12.720269,-0.32201,1
3,0,0,13.0,0,12,1,0,-0.399222,1051.140336,-1.022416,0.906931,129.916224,0.826691,0
4,0,0,0.0,0,0,7,0,-0.086009,-254.164304,1.098035,1.215128,37.566795,0.700999,1


col_0     0
col_1     0
col_2     0
col_3     0
col_4     0
col_5     0
col_6     0
col_7     0
col_8     0
col_9     0
col_10    0
col_11    0
col_12    0
target    0
dtype: int64


In [6]:
#defining features and target
X = df.drop('target', axis=1)
y = df['target']

#standardizing features to ensure all values are on the same range
scl = StandardScaler()
X = scl.fit_transform(X)
print(X)

[[-0.62958552 -0.17752815  1.2774274  ... -0.34609026  0.65049582
   0.98722707]
 [ 0.34950755 -0.17752815  1.2774274  ... -0.3169297  -0.04501845
  -1.50809401]
 [-0.62958552 -0.17752815 -0.89435448 ... -0.16414091 -0.32253007
  -0.64403932]
 ...
 [-0.62958552 -0.17752815  0.50179101 ...  0.32428401  0.67008976
  -0.55177423]
 [-0.62958552 -0.17752815  1.58768195 ...  0.26004373 -0.06429332
   0.06528787]
 [ 3.28678676 -0.17752815 -0.89435448 ...  0.02780512 -0.32959855
  -1.63547631]]


In [7]:
#split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

In [8]:
#feature selection using ANOVA and KBest
selected_features = SelectKBest(score_func=f_classif, k=2)
X_train_sel = selected_features.fit_transform(X_train, y_train)
X_test_sel = selected_features.transform(X_test)
print(X_train_sel.shape)

(2876, 2)


  f = msb / msw


In [9]:
#Random forest classification with grid search
rf_parameters = {
    'n_estimators': [25, 50, 100, 150],
    'max_depth': [10, 15, 20],
    'max_leaf_nodes': [5, 10, 15],
}

#fitting random forest model
rf_grid_search = GridSearchCV(RandomForestClassifier(),param_grid = rf_parameters)
rf_grid_search.fit(X_train_sel, y_train)

#random forest predictions based on the grid search parameters
rf_y_pred = rf_grid_search.best_estimator_.predict(X_test_sel)

print("F1 score for random forest: ",f1_score(y_test, rf_y_pred, average = 'macro'))

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 15, 20],
                         'max_leaf_nodes': [5, 10, 15],
                         'n_estimators': [25, 50, 100, 150]})

F1 score for random forest:  0.8074211732325348


In [10]:
#SVM classification with grid search
#parameters for grid search
svm_parameters = {'C': [0.01, 0.1, 1, 10, 100], 
              'gamma': [0.01, 0.1, 1, 10, 100],
              'kernel': ['rbf']} 

#fitting svm model
svm_grid_search = GridSearchCV(SVC(), param_grid = svm_parameters, refit = True, verbose = 3)
svm_grid_search.fit(X_train_sel, y_train)

#SVM predictions based on the grid search parameters
svm_y_pred = svm_grid_search.best_estimator_.predict(X_test_sel)

print("F1 score for SVM: ",f1_score(y_test, svm_y_pred, average='macro'))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ....C=0.01, gamma=0.01, kernel=rbf;, score=0.474 total time=   0.6s
[CV 2/5] END ....C=0.01, gamma=0.01, kernel=rbf;, score=0.477 total time=   0.6s
[CV 3/5] END ....C=0.01, gamma=0.01, kernel=rbf;, score=0.477 total time=   0.5s
[CV 4/5] END ....C=0.01, gamma=0.01, kernel=rbf;, score=0.477 total time=   0.5s
[CV 5/5] END ....C=0.01, gamma=0.01, kernel=rbf;, score=0.477 total time=   0.5s
[CV 1/5] END .....C=0.01, gamma=0.1, kernel=rbf;, score=0.632 total time=   0.5s
[CV 2/5] END .....C=0.01, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.5s
[CV 3/5] END .....C=0.01, gamma=0.1, kernel=rbf;, score=0.640 total time=   0.5s
[CV 4/5] END .....C=0.01, gamma=0.1, kernel=rbf;, score=0.637 total time=   0.5s
[CV 5/5] END .....C=0.01, gamma=0.1, kernel=rbf;, score=0.652 total time=   0.5s
[CV 1/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.755 total time=   0.4s
[CV 2/5] END .......C=0.01, gamma=1, kernel=rbf

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'gamma': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf']},
             verbose=3)

F1 score for SVM:  0.8063488028634375


In [11]:
#logistic regression with grid search
#parameters for grid search
lr_parameters = {'C': [0.01, 0.1, 1, 10, 100], 
              'penalty': ['l1','l2']} 

#fitting svm model
lr_grid_search = GridSearchCV(LogisticRegression(), param_grid = lr_parameters)
lr_grid_search.fit(X_train_sel, y_train)

#Logistic regression predictions based on the grid search parameters
lr_y_pred = lr_grid_search.best_estimator_.predict(X_test_sel)

print("F1 score for Logistic Regression: ",f1_score(y_test, lr_y_pred, average='macro'))

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/aparnabimal/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/aparnabimal/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/aparnabimal/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' pe

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']})

F1 score for Logistic Regression:  0.8052032374648398
