In [213]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [214]:
from sklearn.datasets import make_checkerboard, make_classification

In [215]:
def score(x, rng=None):
    """Simulate a final score (0-100) from three participation flags.

    The mean of the draw depends on which of ``delivers_homework``,
    ``attends_class`` and ``delivers_project`` are set. The value is drawn
    from a normal distribution and clamped to the range [0, 100].

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide the keys ``'delivers_homework'``, ``'attends_class'`` and
        ``'delivers_project'``. Each value is interpreted by truthiness.
    rng : object with a ``normal(loc, scale)`` method, optional
        Source of randomness, e.g. ``np.random.default_rng(0)``. When omitted,
        NumPy's global ``np.random`` state is used.

    Returns
    -------
    float
        The simulated score, never below 0 or above 100.
    """
    if rng is None:
        rng = np.random

    # (homework, attends, project) -> (mean, standard deviation) of the draw.
    params = {
        (True, True, True): (95, 4),
        (True, True, False): (75, 4),
        (False, True, True): (85, 4),
        (True, False, True): (72, 4),
        (False, False, True): (65, 4),
        (True, False, False): (58, 4),
        (False, True, False): (52, 4),
        (False, False, False): (42, 7),
    }
    # bool() rather than bitwise `&`: any truthy flag counts as set
    # (with `&`, e.g. 2 & 1 == 0 would be treated as "not set").
    flags = (
        bool(x['delivers_homework']),
        bool(x['attends_class']),
        bool(x['delivers_project']),
    )
    mean, spread = params[flags]
    # Clamp both ends so a rare extreme draw cannot leave the 0-100 scale.
    return float(min(100.0, max(0.0, rng.normal(mean, spread))))


In [216]:
# Seed NumPy's global generator so the simulated flags (and the later
# np.random draws made by `score`) are the same on every Restart & Run All.
np.random.seed(42)

n = 500  # number of rows to simulate
# Each column is an independent vector of n random 0/1 flags.
classroom = pd.DataFrame({
    'delivers_homework': np.random.randint(2, size=n),
    'attends_class': np.random.randint(2, size=n),
    'delivers_project': np.random.randint(2, size=n)
})

In [217]:
# Display the generated frame of 0/1 participation flags.
classroom

Unnamed: 0,delivers_homework,attends_class,delivers_project
0,0,1,0
1,0,0,1
2,0,1,1
3,1,0,0
4,0,1,1
...,...,...,...
495,0,1,0
496,1,0,1
497,1,0,0
498,1,0,1


In [218]:
# Draw a score for every row with `score` and attach it as a new `score` column.
# `assign` returns a new frame, so `classroom` itself is left unchanged.
df = classroom.assign(score=classroom.apply(score, axis=1))

In [219]:
# Preview the first rows, now including the simulated score.
df.head()

Unnamed: 0,delivers_homework,attends_class,delivers_project,score
0,0,1,0,52.691065
1,0,0,1,59.28804
2,0,1,1,85.485524
3,1,0,0,53.140355
4,0,1,1,82.308468


In [220]:
# Binary target: 1 when the score is strictly greater than 50, otherwise 0.
df['passed'] = (df['score'] > 50).astype(int)

In [221]:
# Class balance of the target: how many rows are labelled 1 vs. 0.
df['passed'].value_counts()

1    439
0     61
Name: passed, dtype: int64

In [222]:
# Preview the frame without the raw score. The result is only displayed;
# `df` itself is not modified. `columns=` replaces the positional axis
# argument, which newer pandas versions no longer accept.
df.drop(columns='score')

Unnamed: 0,delivers_homework,attends_class,delivers_project,passed
0,0,1,0,1
1,0,0,1,1
2,0,1,1,1
3,1,0,0,1
4,0,1,1,1
...,...,...,...,...
495,0,1,0,1
496,1,0,1,1
497,1,0,0,1
498,1,0,1,1


In [223]:
# Features: the first three columns, i.e. the participation flags that were
# created before `score` and `passed` were appended.
# NOTE(review): this positional slice depends on the column order staying as is.
X = df.iloc[:, :3]
# Target: the binary `passed` label.
y = df['passed']

In [224]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate

In [225]:
# Hold out 10% of the rows for testing. `stratify=y` keeps the pass/fail ratio
# the same in both splits (the classes are imbalanced), and the fixed
# `random_state` makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [226]:
# Random forest with 100 trees; every other hyper-parameter is left at its default.
classifier = RandomForestClassifier(n_estimators=100)

In [227]:
# 10-fold cross-validation on the training split, keeping the in-fold (train)
# scores alongside the held-out fold scores.
val = cross_validate(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
    return_train_score=True,
)
# Report the mean of each score array across the folds.
for label, key in (('Train score', 'train_score'), ('Test score', 'test_score')):
    print(label, val[key].mean())

Train score 0.9511111111111111
Test score 0.9511111111111111


In [228]:
# Fit the forest on the training split only.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [229]:
# Refit the same estimator on the full data set; X also contains the rows
# that were held out as X_test.
classifier.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [230]:
# `classifier` was last fitted on all of X, which includes the X_test rows, so
# scoring it here would leak the hold-out data. Evaluate a forest with the
# same settings trained on the training split only.
RandomForestClassifier(n_estimators=100).fit(X_train, y_train).score(X_test, y_test)

0.94

In [231]:
# Build a second, independently drawn classroom: one fresh vector of n random
# 0/1 flags per feature column, in the same column order as before.
new_classroom = pd.DataFrame({
    column: np.random.randint(2, size=n)
    for column in ('delivers_homework', 'attends_class', 'delivers_project')
})

In [232]:
# Show the row count used for the new classroom (the same `n` as the first one).
n

500

In [233]:
# Save the new classroom while it still holds only the three feature columns
# (score and label are added afterwards); the row index is not written.
new_classroom.to_csv('yeni_sinif.csv', index=False)

In [234]:
# Simulate a score for each new row with the same `score` function; the
# column is added to `new_classroom` in place.
new_classroom['score'] = new_classroom.apply(score, axis=1)

In [235]:
# Same pass rule as for the training data: 1 when score > 50, otherwise 0.
new_classroom['passed'] = (new_classroom['score'] > 50.).astype(int)

In [236]:
# Class balance of the new classroom's labels.
new_classroom['passed'].value_counts()

1    431
0     69
Name: passed, dtype: int64

In [237]:
# Save the simulated score and pass label to a separate file from the
# features, again without the row index.
new_classroom[['score', 'passed']].to_csv('yeni_sinif_sonuclar.csv', index=False)

In [238]:
# Preview the new classroom with its simulated score and label.
new_classroom.head()

Unnamed: 0,delivers_homework,attends_class,delivers_project,score,passed
0,1,1,0,79.308462,1
1,1,0,1,72.792481,1
2,0,0,1,59.929949,1
3,0,0,1,68.991305,1
4,0,1,0,53.599245,1


In [239]:
# Predict pass/fail for the new classroom from its first three columns (the
# feature flags); `score` and `passed` come after them and are excluded.
y_pred = classifier.predict(new_classroom.iloc[:, :3])

In [240]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve

In [241]:
# Share of new-classroom rows whose predicted label equals the true label.
accuracy_score(new_classroom['passed'], y_pred)

0.944

In [242]:
# Precision with class 0 (not passed) as the positive class: of the rows
# predicted 0, the fraction that are truly 0.
precision_score(new_classroom['passed'], y_pred, pos_label=0)

0.8727272727272727

In [243]:
# Recall for class 0: of the rows that are truly 0, the fraction predicted as 0.
recall_score(new_classroom['passed'], y_pred, pos_label=0)

0.6956521739130435

In [244]:
# F1 for class 0: harmonic mean of the class-0 precision and recall.
f1_score(new_classroom['passed'], y_pred, pos_label=0)

0.7741935483870968

## Precision, Recall and F1 scores

In [245]:
# New forest using the entropy criterion and class weights of 8 for class 0
# (not passed) versus 1 for class 1. Rebinding `classifier` replaces the
# earlier, unweighted model.
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', class_weight={0:8, 1:1})

In [246]:
# Train the class-weighted forest on the full original data set.
classifier.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={0: 8, 1: 1},
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [247]:
# Class-0 recall of the weighted forest on the new classroom.
recall_score(new_classroom['passed'], classifier.predict(new_classroom.iloc[:, :3]), pos_label=0)

0.9855072463768116

In [248]:
# Class-0 precision of the weighted forest on the new classroom.
precision_score(new_classroom['passed'], classifier.predict(new_classroom.iloc[:, :3]), pos_label=0)

0.53125

In [249]:
# Overall accuracy of the weighted forest on the new classroom.
accuracy_score(new_classroom['passed'], classifier.predict(new_classroom.iloc[:, :3]))

0.878

In [250]:
# Class-0 F1 of the weighted forest on the new classroom.
f1_score(new_classroom['passed'], classifier.predict(new_classroom.iloc[:, :3]), pos_label=0)

0.6903553299492386

In [251]:
import pickle

In [252]:
# Persist the fitted model. The context manager closes the file handle even
# if pickling raises, instead of leaving it open until garbage collection.
with open('trained_classifier.pk', 'wb') as model_file:
    pickle.dump(classifier, model_file)

## Compare to other models

In [253]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_text

### Logistic Regression

In [254]:
# Logistic regression with the L-BFGS solver; other settings are defaults.
logistic = LogisticRegression(solver='lbfgs')

In [255]:
# Fit on the training split only.
logistic.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [256]:
# Accuracy on the held-out test split.
logistic.score(X_test, y_test)

0.94

In [257]:
# Accuracy on the new classroom: first three columns as features, `passed` as truth.
logistic.score(new_classroom.iloc[:, :3], new_classroom['passed'])

0.944

### Decision Tree

In [258]:
# Single decision tree with default settings.
tree = DecisionTreeClassifier()

In [259]:
# Fit on the training split only.
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [260]:
# Accuracy on the held-out test split.
tree.score(X_test, y_test)

0.94

In [261]:
# Accuracy on the new classroom: first three columns as features, `passed` as truth.
tree.score(new_classroom.iloc[:, :3], new_classroom['passed'])

0.944

In [262]:
# Print the learned splits as text, labelling the features with the first
# three column names of `new_classroom` (the same names the tree was trained on).
print(export_text(tree, feature_names=new_classroom.columns.tolist()[:3]))

|--- delivers_homework <= 0.50
|   |--- delivers_project <= 0.50
|   |   |--- attends_class <= 0.50
|   |   |   |--- class: 0
|   |   |--- attends_class >  0.50
|   |   |   |--- class: 1
|   |--- delivers_project >  0.50
|   |   |--- class: 1
|--- delivers_homework >  0.50
|   |--- attends_class <= 0.50
|   |   |--- delivers_project <= 0.50
|   |   |   |--- class: 1
|   |   |--- delivers_project >  0.50
|   |   |   |--- class: 1
|   |--- attends_class >  0.50
|   |   |--- class: 1



In [264]:
# Feature importances of `classifier` (by this point the class-weighted
# forest), paired with the three feature column names.
pd.DataFrame({
    'feature': new_classroom.columns[:3],
    'importance': classifier.feature_importances_,
})

Unnamed: 0,feature,importance
0,delivers_homework,0.384558
1,attends_class,0.113817
2,delivers_project,0.501625


In [263]:
# Feature importances of the single decision tree, paired with the three
# feature column names.
pd.DataFrame({
    'feature': new_classroom.columns[:3],
    'importance': tree.feature_importances_,
})

Unnamed: 0,feature,importance
0,delivers_homework,0.19912
1,attends_class,0.372479
2,delivers_project,0.428401
