In [21]:
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
import xgboost
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression

In [22]:
df_train = pd.read_csv("data/train/100.csv",sep="\t")
df_test = pd.read_csv("data/test/100.csv",sep="\t")

In [23]:
df_train.head(10)

Unnamed: 0,cellID,f_l1_College & University,f_l1_Food,f_l1_Residence,f_l1_Travel & Transport,f_l1_Outdoors & Recreation,f_l1_Arts & Entertainment,f_l1_Shop & Service,f_l1_Nightlife Spot,f_l1_Professional & Other Places,f_l1_Event,t_predominant
0,2376,0,4,0,0,0,1,4,0,16,0,Commercial
1,6266,0,1,0,0,3,0,0,0,0,0,OpenSpace
2,4023,1,0,2,6,0,0,2,0,3,0,Transportation
3,5242,0,0,0,0,0,0,5,0,1,0,Residential
4,89,0,0,0,0,1,0,0,0,0,0,Commercial
5,3468,0,3,0,0,1,2,2,0,3,0,OpenSpace
6,1197,0,6,6,1,0,3,0,0,3,0,Residential
7,2463,3,5,3,2,3,0,5,0,7,0,Mixed
8,403,0,11,0,0,0,1,1,2,7,0,Commercial
9,4648,0,0,1,0,0,0,0,0,0,0,OpenSpace


In [24]:
df_X_train = df_train[[x for x in df_train.columns if x.startswith("f_")]]
df_y_train = df_train[[x for x in df_train.columns if x.startswith("t_")]]

df_X_test = df_train[[x for x in df_test.columns if x.startswith("f_")]]
df_y_test = df_train[[x for x in df_test.columns if x.startswith("t_")]]

In [25]:
# scaler
X_scaler = StandardScaler()

# scale train feature
df_X_train_scaled = X_scaler.fit_transform(df_X_train)
df_X_train_scaled = pd.DataFrame(df_X_train_scaled, index=df_X_train.index, columns=df_X_train.columns)

# scale test feature
df_X_test_scaled = X_scaler.transform(df_X_test)
df_X_test_scaled = pd.DataFrame(df_X_test_scaled, index=df_X_test.index, columns=df_X_test.columns)

# merge label and feature
df_train_scaled = df_X_train_scaled.merge(df_y_train,left_index=True, right_index=True)
df_test_scaled = df_X_test_scaled.merge(df_y_test,left_index=True, right_index=True)

In [26]:
df_X_train_scaled.head()

Unnamed: 0,f_l1_College & University,f_l1_Food,f_l1_Residence,f_l1_Travel & Transport,f_l1_Outdoors & Recreation,f_l1_Arts & Entertainment,f_l1_Shop & Service,f_l1_Nightlife Spot,f_l1_Professional & Other Places,f_l1_Event
0,-0.241317,0.387489,-0.727389,-0.676537,-0.764213,0.164485,0.310932,-0.508204,3.140005,-0.124595
1,-0.241317,-0.430013,-0.727389,-0.676537,1.601896,-0.399978,-0.721991,-0.508204,-0.80713,-0.124595
2,0.378507,-0.702513,0.400513,2.61274,-0.764213,-0.399978,-0.205529,-0.508204,-0.067043,-0.124595
3,-0.241317,-0.702513,-0.727389,-0.676537,-0.764213,-0.399978,0.569163,-0.508204,-0.560435,-0.124595
4,-0.241317,-0.702513,-0.727389,-0.676537,0.02449,-0.399978,-0.721991,-0.508204,-0.80713,-0.124595


In [27]:
df_y_train.head(10)

Unnamed: 0,t_predominant
0,Commercial
1,OpenSpace
2,Transportation
3,Residential
4,Commercial
5,OpenSpace
6,Residential
7,Mixed
8,Commercial
9,OpenSpace


In [30]:
df = {}
dfs = []

print "****** BASELINE ******"
df['model'] = 'baseline'
df_train['t_predominant'].value_counts().max()
y_pred = [df_train['t_predominant'].value_counts().idxmax()] * len(df_y_test)
df['f1-score'] = metrics.f1_score(df_y_test.values, y_pred,average='macro')
df['accuracy'] =  round(metrics.accuracy_score(df_y_test.values, y_pred),3) * 100
df['precision'] = metrics.recall_score(df_y_test.values, y_pred,average='macro')
df['recall'] = metrics.precision_score(df_y_test.values, y_pred,average='macro')
dfs.append(df)
print df

print "**********************"

****** BASELINE ******
{'recall': 0.053078556263269634, 'model': 'baseline', 'f1-score': 0.080515297906602251, 'precision': 0.16666666666666666, 'accuracy': 31.8}
**********************


In [31]:
# xgboost Classifier
print "****** XGBOOST ******"
df = {}
clf = xgboost.XGBClassifier()
clf.fit(df_X_train.as_matrix(), df_y_train.values)
y_pred = clf.predict(df_X_test.as_matrix())
df['model'] = 'xgBoost'
df['accuracy'] = round(metrics.accuracy_score(df_y_test.values, y_pred),3) * 100
df['precision'] = metrics.recall_score(df_y_test.values, y_pred,average='macro')
df['recall'] = metrics.precision_score(df_y_test.values, y_pred,average='macro')
dfs.append(df)

print df
print "**********************"

****** XGBOOST ******


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'recall': 0.59846296922935138, 'model': 'xgBoost', 'precision': 0.51316705173141031, 'accuracy': 59.199999999999996}
**********************


In [32]:
#Logistic Regression
print "****** Logistic Regression ******"
df = {}
clf = LogisticRegression()
clf.fit(df_X_train.as_matrix(), df_y_train.values)
y_pred = clf.predict(df_X_test.as_matrix())
df['model'] = 'LogReg'
df['f1-score'] = metrics.f1_score(df_y_test.values, y_pred,average='macro') #metrics.accuracy_score(df_y_test.values, y_pred)
df['accuracy'] =  round(metrics.accuracy_score(df_y_test.values, y_pred),3) * 100
df['precision'] = metrics.recall_score(df_y_test.values, y_pred,average='macro')
df['recall'] = metrics.precision_score(df_y_test.values, y_pred,average='macro')
dfs.append(df)

print df
print "**********************"

****** Logistic Regression ******
{'recall': 0.53403078393049486, 'model': 'LogReg', 'f1-score': 0.44173886895974129, 'precision': 0.44770262853650961, 'accuracy': 53.1}
**********************


  y = column_or_1d(y, warn=True)


In [36]:
results = pd.DataFrame(dfs)
results = results[['model','accuracy','f1-score','precision','recall']]

In [37]:
results

Unnamed: 0,model,accuracy,f1-score,precision,recall
0,baseline,31.8,0.080515,0.166667,0.053079
1,xgBoost,59.2,,0.513167,0.598463
2,LogReg,53.1,0.441739,0.447703,0.534031
