In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
df=pd.read_csv('student.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,TENTH,TWELTH,CGPA,PLACED,GENDER
0,0,95,55.0,7.6,0,1
1,1,98,93.2,8.5,1,0
2,2,87,81.0,9.01,1,0
3,3,55,78.0,6.7,0,1
4,4,99,94.0,8.3,1,0


In [4]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Unnamed: 0','PLACED'],axis=1), df['PLACED'], test_size=0.4, random_state=101)

# Logistic Regression

In [7]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [34]:
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.907490 using {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'l1', 'solver': 'lbfgs'}
0.907490 (0.036430) with: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.907490 (0.036430) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.907490 (0.036430) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.907490 (0.036430) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 10, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 10, 'penalty': 'l1', 'solver': 'lbfgs'}
0.906985 (0.036088) with: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}

In [35]:
logmodel = LogisticRegression(C=100,penalty='l1',solver='liblinear')
logmodel.fit(X_train,y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
predictions = logmodel.predict(X_test)

In [37]:
predictions

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,

In [38]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.78      0.81      0.79       185
           1       0.85      0.84      0.85       253

    accuracy                           0.82       438
   macro avg       0.82      0.82      0.82       438
weighted avg       0.83      0.82      0.82       438



In [39]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,predictions)

array([[149,  36],
       [ 41, 212]], dtype=int64)

# Linear Regression

In [40]:
from sklearn.linear_model import LinearRegression
LR = LinearRegression()

In [43]:
LR.fit(X_train,y_train)
LR_preds=LR.predict(X_test)

In [47]:
LR_df = pd.DataFrame({'Actual': y_test, 'Predicted': LR_preds})
LR_df

Unnamed: 0,Actual,Predicted
655,1,0.833106
698,0,0.124510
1035,1,1.079018
1037,1,1.062618
1006,0,0.830639
...,...,...
368,1,1.109843
252,1,0.844371
363,0,-0.055662
196,0,0.830639


# Random Forest Algo

In [17]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
rfc_pred = rfc.predict(X_test)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix

In [20]:
print(confusion_matrix(y_test,rfc_pred))
print(classification_report(y_test,rfc_pred))

[[185   0]
 [  4 249]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       185
           1       1.00      0.98      0.99       253

    accuracy                           0.99       438
   macro avg       0.99      0.99      0.99       438
weighted avg       0.99      0.99      0.99       438



In [21]:
from sklearn.ensemble import RandomForestRegressor

#rfr = RandomForestRegressor(n_estimators=20, random_state=0)
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_pred = rfr.predict(X_test)

In [22]:
rfr_df = pd.DataFrame({'Actual': y_test, 'Predicted': rfr_pred})
rfr_df

Unnamed: 0,Actual,Predicted
655,1,0.99
698,0,0.02
1035,1,1.00
1037,1,1.00
1006,0,0.09
...,...,...
368,1,1.00
252,1,1.00
363,0,0.00
196,0,0.09


In [23]:
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rfr_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rfr_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rfr_pred)))

Mean Absolute Error: 0.039452054794520554
Mean Squared Error: 0.016919178082191778
Root Mean Squared Error: 0.13007374094025195


In [24]:
rfr_df['Predicted'].unique()

array([0.99, 0.02, 1.  , 0.09, 0.97, 0.73, 0.  , 0.01, 0.89, 0.95, 0.91,
       0.04, 0.98, 0.05, 0.07, 0.93, 0.49, 0.27, 0.19, 0.81, 0.92, 0.15,
       0.38, 0.6 , 0.96, 0.68, 0.4 , 0.03, 0.1 ])

In [25]:
#create joblib file of random forest regressor
from joblib import dump,load
dump(rfr,'RandomForestRegressor.joblib')

['RandomForestRegressor.joblib']