# Predicting myopia in a Children

In [1]:
import pickle
import pandas as pd
import requests

Data Variables
* id: Study ID
* studyyear: Year subject entered the study
* myopic: Myopia within the first five years of follow up (No/Yes)
* age: Age at first visit (years)
* gender: Gender (Male/Female)
* spheq: Spherical Equivalent Refraction (diopter)
* al: Axial Length (mm)
* acd: Anterior Chamber Depth (mm)
* lt: Lens Thickness (mm)
* vcd: Vitreous Chamber Depth (mm)
* sporthr: Time spent engaging in sports/outdoor activities (hours per week)
* readhr:Time spent reading for pleasure (hours per week)
* comphr:Time spent playing video/computer games or working on the computer (hours per week)
* studyhr:Time spent reading or studying for school assignments (hours per week)
* tvhr: Time spent watching television (hours per week)
* diopterhr: Composite of near-work activities (hours per week)
* mommy: Was the subject’s mother myopic? (No/Yes)
* dadmy: Was the subject’s father myopic? (No/Yes)


In [2]:
myopia = pd.read_csv('MYOPIA.csv')   
myopia.head()

Unnamed: 0,ID,STUDYYEAR,MYOPIC,AGE,GENDER,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,1,1992,1,6,1,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,2,1995,0,6,1,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,3,1991,0,6,1,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,4,1990,1,6,1,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,5,1995,0,5,0,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


In [3]:
target = 'MYOPIC'
features = list(myopia.columns)
features.remove(target)
features.remove('ID')
features.remove('STUDYYEAR')

In [4]:
myopia.AGE.unique()

array([6, 5, 7, 8, 9], dtype=int64)

In [5]:
myopia.GENDER.unique()
myopia.DADMY.unique()

array([1, 0], dtype=int64)

In [6]:
dummie = pd.get_dummies(myopia['AGE'], prefix = 'AGE')
names = list(dummie.columns)
names.remove(names[0])
features.remove('AGE')
myopia_dummy = pd.concat([myopia[features], dummie[names]], axis = 1)

In [7]:
from sklearn.linear_model import LinearRegression
def calculateVIF(data):
    features = list(data.columns)
    num_features = len(features)
    
    model = LinearRegression()
    
    result = pd.DataFrame(index = ['VIF'], columns = features)
    result = result.fillna(0)
    
    for ite in range(num_features):
        x_features = features[:]
        y_featue = features[ite]
        x_features.remove(y_featue)
        
        x = data[x_features]
        y = data[y_featue]
        
        model.fit(data[x_features], data[y_featue])
        
        if model.score(data[x_features], data[y_featue]) == 1:
            result[y_featue] = 9999999999999
        else:
            result[y_featue] = 1/(1 - model.score(data[x_features], data[y_featue]))
    
    return result

def selectDataUsingVIF(data, max_VIF = 5):
    result = data.copy(deep = True)
    
    VIF = calculateVIF(result)
    
    while VIF.as_matrix().max() > max_VIF:
        col_max = np.where(VIF == VIF.as_matrix().max())[1][0]
        features = list(result.columns)
        features.remove(features[col_max])
        result = result[features]
        
        VIF = calculateVIF(result)
        
    return result

In [8]:
calculateVIF(myopia_dummy)

Unnamed: 0,GENDER,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,AGE_6,AGE_7,AGE_8,AGE_9
VIF,1.307391,1.233264,30990.423688,3553.695725,1601.827052,29593.71668,1.088203,9999999999999,9999999999999,9999999999999,9999999999999,9999999999999,1.066731,1.035865,6.273963,4.466573,3.761098,1.336298


In [9]:
import numpy as np
vif_selection = selectDataUsingVIF(myopia_dummy, 5)
calculateVIF(vif_selection)



Unnamed: 0,GENDER,SPHEQ,ACD,LT,VCD,SPORTHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,AGE_7,AGE_8,AGE_9
VIF,1.300187,1.211911,1.308197,1.435744,1.490542,1.087364,1.656589,2.352737,1.456505,3.37737,1.047097,1.035732,1.0802,1.413435,1.051193


In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

var_sk = SelectKBest(f_classif, k = 8)
var_sk.fit_transform(vif_selection, myopia[target])

k_selection = vif_selection.loc[:, var_sk.get_support()]

In [11]:
k_selection.head()

Unnamed: 0,GENDER,SPHEQ,ACD,LT,SPORTHR,MOMMY,DADMY,AGE_9
0,1,-0.052,3.69,3.498,45,1,1,0
1,1,0.608,3.702,3.392,4,1,1,0
2,1,1.179,3.462,3.514,14,0,0,0
3,1,0.525,3.862,3.612,18,0,1,0
4,0,0.697,3.676,3.454,14,1,0,0


In [12]:
from sklearn.model_selection import train_test_split

features = list(k_selection.columns)

x_train, x_test, y_train, y_test = train_test_split(k_selection, myopia[target],  random_state = 0)

## Metric Function

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve

def metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)

    print(u'Confusion Matrix \n', cm)

    print(u'Accurancy:', accuracy_score(y_true, y_pred))
    print(u'Precision:', precision_score(y_true, y_pred))
    print(u'Recall:', recall_score(y_true, y_pred))

    false_positive_rate, recall, thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(false_positive_rate, recall)

    print(u'AUC:', auc(false_positive_rate, recall))


### Logistic Regression

In [14]:
from sklearn.linear_model.logistic import LogisticRegression

model = LogisticRegression().fit(x_train, y_train)
y_pred_train = model.predict(x_train)

metrics(y_train, y_pred_train)
print(x_train.columns)

Confusion Matrix 
 [[397   7]
 [ 39  20]]
Accurancy: 0.9006479481641468
Precision: 0.7407407407407407
Recall: 0.3389830508474576
AUC: 0.6608281590870951
Index(['GENDER', 'SPHEQ', 'ACD', 'LT', 'SPORTHR', 'MOMMY', 'DADMY', 'AGE_9'], dtype='object')




In [15]:
y_pred_test = model.predict(x_test)
metrics(y_test, y_pred_test)

Confusion Matrix 
 [[130   3]
 [ 16   6]]
Accurancy: 0.8774193548387097
Precision: 0.6666666666666666
Recall: 0.2727272727272727
AUC: 0.6250854408749146


In [22]:
import numpy as np
a=np.array([[ 0 , 0.396, 3.304 ,3.654 ,8,1, 1, 0 ]])
print(model.predict_proba(a))
ans=model.predict_proba(a)
print(x_train.columns)

[[0.77754347 0.22245653]]
Index(['GENDER', 'SPHEQ', 'ACD', 'LT', 'SPORTHR', 'MOMMY', 'DADMY', 'AGE_9'], dtype='object')


In [19]:
pickle.dump(model, open("fin", 'wb'))
