In [1]:
# import all pre-requisite data manipulation libraries

import pandas as pd, numpy as np, re

In [2]:
# load the training data

df =pd.read_json('train_data.json')

# on examining the dataset we see that we need to take a transpose of it (just do df.head() and examine yourself as well)

df = df.T

In [3]:
# need to encode target variables to 1: positive or 0: negative

def map(val):
    if val=='pos':return 1
    return 0
df['segment'] = df['segment'].map(map)

In [5]:
# just checking the amount of each segment

df.segment.value_counts() ##so many negative segments

0    184745
1     15255
Name: segment, dtype: int64

In [6]:
## check stackoverflow for re.sub use
## also https://docs.python.org/2/library/re.html

## check the above mentioned resources for more information about how re library works

## general prpose function to extract one-hot encoded feature values for mentioned dataframe columns 

def get_features(dataframe):
    refined  = [re.sub('\:\d+','',i).split(',') for i in dataframe ]
    get_columns = list(set([i for j in refined for i in j]))
    prepare = pd.DataFrame(columns = get_columns)
    for i in get_columns:
        f1 = []
        for j in refined:
            if i in j:f1.append(1)
            else: f1.append(0) 
        prepare[i] = f1
    return prepare

In [37]:
# we can get one hot encoding values for days_of_week, genre and time_of_day which could be useful for analysis

Genres = get_features(df['genres'])
Days   = get_features(df['dow'])
Hours = get_features(df['tod'])


## rename the Days dataframe columns for better understading (numbers dont seem good to comprehend sometimes)

Days.rename(columns = {'1':'Monday','2':'Tuesday','3':'Wednesday','4':'Thursday','5':'Friday','6':'Saturday','7':'Sunday'},inplace = True)

In [38]:
# checking did it worked ? yes it did! ^_^!

Genres.head()

Unnamed: 0,Travel,Kabaddi,Crime,Romance,LiveTV,Hockey,FormulaE,Comedy,Teen,Cricket,...,Drama,Action,Athletics,Reality,Documentary,Swimming,Formula1,Family,Badminton,Sport
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,1,0,0,1,0,0,0,0,0,0


In [None]:
## we need numerical data about total time spent we can get from genres important for analysis of "segment" 
## refer to:   https://docs.python.org/2/library/re.html     (for info about regex library used here )

total_time = []
for i in df.genres:    
    total_time.append(sum([int(re.sub('.*\:','',j)) for j in i.split(',')]))


In [40]:
## making a dataframe for the Total_time feature

Total_times = pd.DataFrame(total_time,columns=['Time_given'])

In [41]:
## Time to combine all the features we made 

Genres = Genres.join(Days).join(Hours).join(Total_times)

In [42]:
## lets have a look at the training features 

Genres.head()

Unnamed: 0,Travel,Kabaddi,Crime,Romance,LiveTV,Hockey,FormulaE,Comedy,Teen,Cricket,...,10,13,12,15,14,17,16,19,18,Time_given
0,0,1,0,0,0,0,0,0,0,1,...,1,1,1,1,1,1,1,1,1,87385
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,1,1,16370
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4142
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,8348
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,1,1,10131


In [43]:
## can we get anymore info out of the train data?? [titles, cities]

## cities area lot (around 1500 unique cities) cant make one hot encoding so we can take there count into consideration
## also titles of shows are a lot too so avoid making too many features (more features -> overfitting!!!)

def get_count_features(dataframe):
    return [len(re.sub('\:\d+','',i).split(',')) for i in dataframe]

Genres['cities_count'] = get_count_features(df['cities'])
Genres['titles_count'] = get_count_features(df['titles'])

In [45]:
## lets have a look at our final features for training

Genres.head()

Unnamed: 0,Travel,Kabaddi,Crime,Romance,LiveTV,Hockey,FormulaE,Comedy,Teen,Cricket,...,12,15,14,17,16,19,18,Time_given,cities_count,titles_count
0,0,1,0,0,0,0,0,0,0,1,...,1,1,1,1,1,1,1,87385,2,60
1,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,1,1,1,16370,3,70
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,4142,1,2
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,8348,3,8
4,0,0,0,0,0,0,0,0,0,1,...,0,1,0,1,0,1,1,10131,2,11


In [46]:
##  final storage of training and target information

train = Genres
y = pd.DataFrame(df.segment,columns = ['segment'])

In [47]:
## save this dataframe for re-use  

train.to_csv('train_data.csv',index=False)
y.to_csv('target_val.csv',index=False)

#train = pd.read_csv('train_data.csv')
#y     = pd.read_csv('target_val.csv')

In [48]:
## lets load the test data (same procedure as for train_data.json)

test = pd.read_json('test_data.json')
test = test.T

In [49]:
## Repeat the above mentioned complete procedure for test data as well

Genres_test = get_features(test['genres'])
Days_test   = get_features(test['dow']) 
total_time_test = []
for i in test.genres:    
    total_time_test.append(sum([int(re.sub('.*\:','',j)) for j in i.split(',')]))
Total_times_test = pd.DataFrame(total_time_test,columns=['Time_given'])
Days_test.rename(columns = {'1':'Monday','2':'Tuesday','3':'Wednesday','4':'Thursday','5':'Friday','6':'Saturday','7':'Sunday'},inplace = True)
Hours_test = get_features(test['tod'])
Genres_test = Genres_test.join(Days_test).join(Hours_test).join(Total_times_test)
Genres_test['cities_count'] = get_count_features(test['cities'])
Genres_test['titles_count'] = get_count_features(test['titles'])

In [50]:
## Here is the test data final!!

test_data = Genres_test

## lets have a look!

test_data.head()

Unnamed: 0,Travel,Kabaddi,Crime,Romance,LiveTV,Hockey,FormulaE,Comedy,Teen,Cricket,...,12,15,14,17,16,19,18,Time_given,cities_count,titles_count
0,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,884,2,8
1,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,23851,3,10
2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,13873,3,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,6735,3,4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1204,1,1


In [58]:
## Time to train our ML model

## import pre-requisite libraries 

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

## split the data for testing purposes 70-30 ratio

X_train,X_test,y_train,y_test = train_test_split(train.values,y.values.ravel(),test_size = 0.3,stratify=y)

## normlize the features before using (good practice)

scaler = StandardScaler().fit(X_train)
X_train=scaler.transform(X_train)

## define the classifier with suitable parameters

rfc = RFC(n_jobs=-1,max_features= 'auto' ,n_estimators=550, max_depth = 10, min_samples_split = 10, oob_score = True)

'''
## Takes a lot of time to use exhaustive search to find the best parameter combination (using manual tuning instead)


params = [
    {'n_estimators':[500,550],
     #      'max_features':['auto','sqrt','log2'],
     'max_depth':[5,7,9,12],
     'min_samples_split':[10,11,12]
    }
]

gcv = GridSearchCV(estimator=rfc, param_grid= params, cv=5)

'''

## fit the training data

rfc.fit(X_train,y_train)

## predict the probablistic value for "pos" segment

y_pred = rfc.predict_proba(X_test)[:,1]

## test for AUC

print roc_auc_score(y_test, y_pred)

0.655823992964


In [52]:
## preparing the final target values for the Test data

rfc.fit(train.values,y.values.ravel())
y_final = rfc.predict_proba(test_data.values)[:,1]

In [54]:
## preparing the submission.csv file for real evaluation

ID = test.T.columns
submission = pd.DataFrame(columns=['ID','segment'])
submission['ID'], submission['segment'] = ID, y_final
submission.to_csv('submission.csv',index = False)