In [85]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from patsy import dmatrices

In [63]:
# Read the raw data into a pandas DataFrame. Column names are supplied
# explicitly through `names` because the data file does not carry them.
df = pd.read_csv(
    'adult+stretch.data',
    names=['color', 'size', 'action', 'age', 'inflated'],
)

In [64]:
# Display the freshly loaded frame as a visual check of the parse.
df

Unnamed: 0,color,size,action,age,inflated
0,YELLOW,SMALL,STRETCH,ADULT,T
1,YELLOW,SMALL,STRETCH,ADULT,T
2,YELLOW,SMALL,STRETCH,CHILD,F
3,YELLOW,SMALL,DIP,ADULT,F
4,YELLOW,SMALL,DIP,CHILD,F
5,YELLOW,LARGE,STRETCH,ADULT,T
6,YELLOW,LARGE,STRETCH,ADULT,T
7,YELLOW,LARGE,STRETCH,CHILD,F
8,YELLOW,LARGE,DIP,ADULT,F
9,YELLOW,LARGE,DIP,CHILD,F


In [65]:
# Report the frame's dimensions as a (rows, columns) tuple.
df.shape

(20, 5)

In [66]:
# Initial exploration: count how many rows fall under each value of the
# `inflated` column.
# The DataFrame.groupby method is used because the module-level
# pd.groupby(df, ...) helper is not available in current pandas releases.
grp_inflated = df.groupby(['inflated'])
grp_inflated.size()

inflated
F    12
T     8
dtype: int64

In [67]:
# Per the test condition described with the data file: an adult can inflate
# the balloon, while a child always fails to inflate it.
# Count rows for every (age, inflated) combination. DataFrame.groupby is used
# because the module-level pd.groupby(df, ...) helper is not available in
# current pandas releases.
grp_age_inflated = df.groupby(['age', 'inflated'])
grp_age_inflated.size()

age    inflated
ADULT  F           4
       T           8
CHILD  F           8
dtype: int64

In [68]:
# Per the test condition described with the data file: only the STRETCH
# action can inflate the balloon.
# Count rows for every (action, inflated) combination. DataFrame.groupby is
# used because the module-level pd.groupby(df, ...) helper is not available in
# current pandas releases.
grp_action_inflated = df.groupby(['action', 'inflated'])
grp_action_inflated.size()

action   inflated
DIP      F           8
STRETCH  F           4
         T           8
dtype: int64

In [69]:
# Formatting the dataset to fit the model

In [70]:
# Encode the target as integers. Note the direction: 'T' maps to 0 and 'F'
# maps to 1, so (assuming T/F mean true/false) class 1 is "did not inflate".
# Run once per fresh load: values that are not keys of the mapping become NaN.
df['inflated'] = df['inflated'].map({
    'T': 0,
    'F': 1,
})

In [71]:
# Encode `age` as integers: ADULT -> 1, CHILD -> 2.
# Run once per fresh load: values that are not keys of the mapping become NaN.
df['age'] = df['age'].map({
    'ADULT': 1,
    'CHILD': 2,
})

In [72]:
# Encode `action` as integers: STRETCH -> 1, DIP -> 2.
# Run once per fresh load: values that are not keys of the mapping become NaN.
df['action'] = df['action'].map({
    'STRETCH': 1,
    'DIP': 2,
})

In [73]:
# Encode `size` as integers: LARGE -> 1, SMALL -> 2.
# Bracket access is required here: `df.size` is the DataFrame element count,
# not this column.
# Run once per fresh load: values that are not keys of the mapping become NaN.
df['size'] = df['size'].map({
    'LARGE': 1,
    'SMALL': 2,
})

In [74]:
# Encode `color` as integers: YELLOW -> 1, PURPLE -> 2.
# Run once per fresh load: values that are not keys of the mapping become NaN.
df['color'] = df['color'].map({
    'YELLOW': 1,
    'PURPLE': 2,
})

In [75]:
# Display the frame again to confirm every column now holds integer codes.
df

Unnamed: 0,color,size,action,age,inflated
0,1,2,1,1,0
1,1,2,1,1,0
2,1,2,1,2,1
3,1,2,2,1,1
4,1,2,2,2,1
5,1,1,1,1,0
6,1,1,1,1,0
7,1,1,1,2,1
8,1,1,2,1,1
9,1,1,2,2,1


In [76]:
# Build the design matrices from a patsy formula: `inflated` is the response
# and the four encoded columns are the predictors. Both results are requested
# as DataFrames.
y, X = dmatrices(
    'inflated ~ color + size + action + age',
    data=df,
    return_type="dataframe",
)


In [77]:
# Collapse the response into a flat 1-D NumPy array, then show it
y = np.asarray(y).ravel()
y

array([ 0.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,
        1.,  1.,  0.,  0.,  1.,  1.,  1.])

In [78]:
# Display the predictor design matrix produced by dmatrices.
X

Unnamed: 0,Intercept,color,size,action,age
0,1,1,2,1,1
1,1,1,2,1,1
2,1,1,2,1,2
3,1,1,2,2,1
4,1,1,2,2,2
5,1,1,1,1,1
6,1,1,1,1,1
7,1,1,1,1,2
8,1,1,1,2,1
9,1,1,1,2,2


In [21]:
#Going to fit a Logistic Regression Model as the response is binary 

In [82]:
# Fit a logistic regression with the estimator's default settings on the full
# data set; the fitted estimator is the cell's displayed value.
# NOTE(review): the displayed X includes an Intercept column and the displayed
# estimator has fit_intercept=True, so two intercept-like terms are present —
# confirm this is intended.
log_model = LogisticRegression()
log_model.fit(X=X, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [83]:
# Accuracy measured on the same rows the model was fitted on (training-set
# accuracy).
log_model.score(X=X, y=y)

0.90000000000000002

In [58]:
# Examine the coefficients: pair each design-matrix column name with its
# fitted coefficient, one row per column.
# zip() is materialised with list() so the DataFrame constructor receives a
# concrete sequence on Python 3 as well, where zip() returns a lazy iterator.
pd.DataFrame(list(zip(X.columns, np.transpose(log_model.coef_))))

Unnamed: 0,0,1
0,Intercept,[0.587940229109]
1,color,[0.439491801743]
2,size,[0.439491801743]
3,action,[-1.1182868351]
4,age,[-1.1182868351]


In [81]:
# Evaluate on held-out data: reserve 30% of the rows for testing (the split is
# seeded with random_state=0) and fit a second, separate model on the rest.
# The fitted estimator is the cell's displayed value.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.3, random_state=0)
model2 = LogisticRegression()
model2.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [60]:

# Predict class labels for the held-out test rows.
predicted = model2.predict(X_test)
# Parenthesised single-argument form prints the same text whether `print` is
# a statement (Python 2) or a function (Python 3).
print(predicted)

[ 0.  0.  0.  0.  1.  0.]


In [61]:
# Generate class probabilities for the held-out test rows (one row per sample,
# one column per class).
probs = model2.predict_proba(X_test)
# Parenthesised single-argument form prints the same text whether `print` is
# a statement (Python 2) or a function (Python 3).
print(probs)


[[ 0.52705056  0.47294944]
 [ 0.55894099  0.44105901]
 [ 0.73086057  0.26913943]
 [ 0.67685387  0.32314613]
 [ 0.40271205  0.59728795]
 [ 0.57561186  0.42438814]]


In [62]:
# Generate evaluation metrics on the held-out test rows.
# Parenthesised single-argument form prints the same text whether `print` is
# a statement (Python 2) or a function (Python 3).
# Accuracy compares the predicted labels with the true labels.
print(metrics.accuracy_score(y_test, predicted))
# ROC AUC is scored from the second probability column (index 1).
print(metrics.roc_auc_score(y_test, probs[:, 1]))

0.833333333333
0.875
