In [1]:
import pandas as pd

## Loading the data

In [2]:
# Load the dataset; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='../data/data.csv')

In [5]:
# Preview the first five rows, transposed so that each column reads as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
type_school,Academic,Academic,Academic,Vocational,Academic
school_accreditation,A,A,B,B,A
gender,Male,Male,Female,Male,Female
interest,Less Interested,Less Interested,Very Interested,Very Interested,Very Interested
residence,Urban,Urban,Urban,Rural,Urban
parent_age,56,57,50,49,57
parent_salary,6950000,4410000,6500000,6600000,5250000
house_area,83.0,76.8,80.6,78.2,75.1
average_grades,84.09,86.91,87.43,82.12,86.79
parent_was_in_college,False,False,False,True,False


In [6]:
# Number of rows in the dataset.
df.shape[0]

1000

In [8]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

type_school              0
school_accreditation     0
gender                   0
interest                 0
residence                0
parent_age               0
parent_salary            0
house_area               0
average_grades           0
parent_was_in_college    0
in_college               0
dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

In [68]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the
# split reproducible across runs.
df_train_all, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

### EDA

In [69]:
# Class balance of the target within the training portion.
df_train_all['in_college'].value_counts()

True     405
False    395
Name: in_college, dtype: int64

In [70]:
# Encode the True/False target as 0/1 integer arrays for both partitions.
y_train_all = df_train_all['in_college'].astype(int).values
y_test = df_test['in_college'].astype(int).values

In [71]:
# Share of positive (in_college) rows in the training portion.
y_train_all.sum() / y_train_all.size

0.50625

In [72]:
# Columns treated as categorical features (the boolean parent flag included).
categorical = [
    'type_school',
    'school_accreditation',
    'gender',
    'interest',
    'residence',
    'parent_was_in_college',
]

In [73]:
# Summary (count / unique / top / freq) of the categorical columns.
df_train_all.loc[:, categorical].describe()

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_was_in_college
count,800,800,800,800,800,800
unique,2,2,2,5,2,2
top,Academic,B,Male,Very Interested,Urban,True
freq,488,417,401,262,436,416


In [74]:
# Columns treated as numerical features.
numerical = [
    'parent_age',
    'parent_salary',
    'house_area',
    'average_grades',
]

In [75]:
# Descriptive statistics of the numerical columns, rounded to two decimals.
df_train_all.loc[:, numerical].describe().round(decimals=2)

Unnamed: 0,parent_age,parent_salary,house_area,average_grades
count,800.0,800.0,800.0,800.0
mean,52.24,5372650.0,74.43,86.05
std,3.47,1387610.92,14.95,3.34
min,40.0,1000000.0,20.0,75.0
25%,50.0,4367500.0,64.6,83.72
50%,52.0,5430000.0,75.3,85.5
75%,54.0,6382500.0,84.6,88.19
max,65.0,10000000.0,116.3,98.0


Now let's see which features are important

In [76]:
from sklearn.metrics import mutual_info_score

In [77]:
def mutual_info_churn_score(series):
    """Return the mutual information between `series` and the target.

    The target is the `in_college` column of the module-level `df_train_all`
    frame, so `series` is expected to be a column of that same frame.

    NOTE(review): the function name mentions "churn" although the target
    used here is `in_college`; presumably carried over from another
    project. The name is kept so existing callers keep working.
    """
    target = df_train_all.in_college
    return mutual_info_score(series, target)

# Score every categorical column against the target, then list the
# strongest relationships first.
mi = df_train_all.loc[:, categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

interest                 0.039574
school_accreditation     0.001280
parent_was_in_college    0.001107
residence                0.000351
gender                   0.000313
type_school              0.000205
dtype: float64

It looks like the only categorical feature with a noticeable relationship to the target is the student's interest; the others have mutual information close to zero.

In [78]:
# Absolute correlation of each numerical feature with the target.
df_train_all.loc[:, numerical].corrwith(df_train_all['in_college']).abs()

parent_age        0.050370
parent_salary     0.481177
house_area        0.443250
average_grades    0.512619
dtype: float64

Parents' salary and house area are probably correlated with each other, and average grades seem to be the strongest contributing factor.

In [79]:
# Check whether parent salary and house area carry overlapping information.
df_train_all.loc[:, ['parent_salary']].corrwith(df_train_all['house_area'])

parent_salary    0.124039
dtype: float64

They're actually only weakly correlated (about 0.12), so both should be useful features.

Let's train our first model. We'll use all features first and then only a subset.

## Training model

In [81]:
# Carve a validation set out of the training portion: 25% of the remaining
# 80% is 20% of the full dataset. The seed is fixed for reproducibility.
df_train, df_val = train_test_split(
    df_train_all,
    test_size=0.25,
    random_state=1,
)

In [84]:
# 0/1 integer target arrays for the training and validation partitions.
y_train = df_train['in_college'].astype(int).values
y_val = df_val['in_college'].astype(int).values

In [85]:
from sklearn.feature_extraction import DictVectorizer

In [87]:
# One dict per training row, restricted to the selected feature columns.
train_dicts = df_train.loc[:, categorical + numerical].to_dict(orient='records')

In [88]:
# Learn the feature vocabulary from the training dicts, then encode them
# into the training feature matrix.
dv = DictVectorizer()
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [89]:
from sklearn.linear_model import LogisticRegression

In [90]:
# Fit a logistic regression with scikit-learn's default settings.
# NOTE(review): the features are passed in unscaled (parent_salary reaches
# 10,000,000 in the training data) — confirm that the solver converges;
# scaling the numerical columns may be needed.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [91]:
# Encode the validation rows with the vectorizer that was fitted on the
# training data (transform only, no refitting).
val_dicts = df_val.loc[:, categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [95]:
# Keep only the second probability column, i.e. the score for class 1.
val_proba = lr.predict_proba(X_val)
y_pred = val_proba[:, 1]

In [96]:
from sklearn.metrics import roc_auc_score

In [97]:
# ROC AUC of the predicted class-1 scores on the validation set.
roc_auc_score(y_true=y_val, y_score=y_pred)

0.7903161264505801

The validation AUC is about 0.79 - not bad for a first model!