### Tokenized Model

In [194]:
# Imports for the token-frequency classifiers (logistic regression, random forest, LightGBM)
import gensim.models as g
import codecs
from scipy import spatial
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor

from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb


In [277]:
def train_test_split(data,labels,n_class,tr_cut):
    """Split data and labels sequentially and compute balanced class weights.

    The first ``round(tr_cut * n_rows)`` rows form the training part and the
    remaining rows form the test part. Rows are not shuffled.

    Parameters
    ----------
    data : sliceable object with ``.shape`` (e.g. DataFrame or ndarray)
    labels : sliceable object with ``.shape`` holding one label per row
    n_class : int
        Number of classes; the weighted classes are ``0 .. n_class-1``.
    tr_cut : float
        Fraction of rows placed in the training part.

    Returns
    -------
    tuple
        ``(data_train, labels_train, data_test, labels_test, cw_dict)`` where
        ``cw_dict`` maps each class index to its 'balanced' weight.
    """
    # Each cut index is computed once and reused for both halves of the split.
    data_cut = round(tr_cut * data.shape[0])
    labels_cut = round(tr_cut * labels.shape[0])
    data_train, data_test = data[:data_cut], data[data_cut:]
    labels_train, labels_test = labels[:labels_cut], labels[labels_cut:]

    # Keyword arguments: newer scikit-learn releases only accept `classes`
    # and `y` by keyword, and older releases accept the same names.
    # Note: the weights are derived from ALL labels, not only the training part.
    class_weight = compute_class_weight(
        class_weight='balanced', classes=np.arange(0, n_class), y=labels
    )
    cw_dict = {class_idx: weight for class_idx, weight in enumerate(class_weight)}
    return data_train,labels_train,data_test,labels_test,cw_dict
        
def log_reg_model(data_train,labels_train,data_test,labels_test,n_class,cw_dict):
    """Fit a class-weighted logistic regression, print diagnostics, return the model.

    Parameters
    ----------
    data_train, labels_train : training features and labels
    data_test, labels_test : held-out features and labels
    n_class : int
        Number of classes; more than two selects the multinomial formulation.
    cw_dict : dict
        Class index -> weight, passed to ``class_weight``.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    if n_class>2:
        log_reg = LogisticRegression(multi_class='multinomial',solver='lbfgs',n_jobs=-1,class_weight=cw_dict)
        # Binary averaging (the metric default) does not apply to multiclass
        # targets, so precision/recall are macro-averaged in that case.
        score_kwargs = {'average': 'macro'}
    else:
        log_reg = LogisticRegression(class_weight=cw_dict,n_jobs=-1,solver='lbfgs')
        score_kwargs = {}
    log_reg.fit(data_train,labels_train)

    # Predict once per split and reuse the result for every metric below.
    train_pred = log_reg.predict(data_train)
    test_pred = log_reg.predict(data_test)

    print("Training accuracy-")
    print(accuracy_score(labels_train,train_pred))
    print("Testing accuracy-")
    print(accuracy_score(labels_test,test_pred))
    print("Confusion matrix -")
    print(confusion_matrix(labels_test,test_pred))
    print('true negatives is C[0,0] , false negatives is C[1,0] , true positives is C[1,1]  and false positives is C[0,1]')
    print("Precision and recall - ")
    print(precision_score(labels_test,test_pred,**score_kwargs))
    print(recall_score(labels_test,test_pred,**score_kwargs))
    return log_reg

def rf_model(data_train,labels_train,data_test,labels_test,class_weight):
    """Fit a random forest classifier, print diagnostics, return the model.

    Parameters
    ----------
    data_train, labels_train : training features (2-D) and labels
    data_test, labels_test : held-out features and labels
    class_weight : dict or None
        Passed unchanged to ``RandomForestClassifier(class_weight=...)``.

    Returns
    -------
    The fitted ``RandomForestClassifier`` instance.
    """
    # The forest was configured with a fixed max_features=240. Clamp it to the
    # number of columns actually present so narrower feature sets can be fitted.
    # NOTE(review): some scikit-learn releases reject max_features larger than
    # the column count - confirm against the installed version.
    n_features = np.shape(data_train)[1]
    rand_f_mdl = RandomForestClassifier(
        class_weight=class_weight,
        max_depth=40,
        max_features=min(240, n_features),
        n_estimators=200,
        n_jobs=-1,
    )
    rand_f_mdl.fit(data_train,labels_train)

    # Predict once per split and reuse the result for every metric below.
    train_pred = rand_f_mdl.predict(data_train)
    test_pred = rand_f_mdl.predict(data_test)

    print("Training accuracy-")
    print(accuracy_score(labels_train,train_pred))
    print("Testing accuracy-")
    print(accuracy_score(labels_test,test_pred))
    print("Confusion matrix -")
    print(confusion_matrix(labels_test,test_pred))
    print('true negatives is C[0,0] , false negatives is C[1,0] , true positives is C[1,1]  and false positives is C[0,1]')
    print("Precision and recall - ")
    print(precision_score(labels_test,test_pred))
    print(recall_score(labels_test,test_pred))
    return rand_f_mdl

def threshold(x, cutoff=.55):
    """Convert scores into 0/1 labels.

    Parameters
    ----------
    x : iterable of numbers
        Scores to binarise.
    cutoff : float, optional
        A score greater than or equal to this value becomes 1, anything
        below becomes 0. Defaults to 0.55, the value previously hard-coded.

    Returns
    -------
    list of int
        One label per input score, in input order.
    """
    return [1 if score >= cutoff else 0 for score in x]




def light_gbm_model(data_train,labels_train,data_test,labels_test,class_weight):
    """Train a binary LightGBM booster, print diagnostics, return the booster.

    Parameters
    ----------
    data_train, labels_train : training features and labels
    data_test, labels_test : held-out features and labels
    class_weight :
        Accepted for signature parity with the other model helpers but not
        used; class imbalance is handled through ``is_unbalance=True``.

    Returns
    -------
    The booster returned by ``lgb.train`` after 1000 rounds.
    """
    d_train = lgb.Dataset(data_train, label=labels_train)
    params = {
        'learning_rate': 0.003,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'sub_feature': 0.5,
        'num_leaves': 40,
        'min_data': 50,
        'max_depth': 50,
        'is_unbalance': True,
    }
    clf = lgb.train(params, d_train, 1000)

    # Score each split once, binarise once, and reuse for all metrics below.
    train_labels_pred = threshold(clf.predict(data_train))
    test_labels_pred = threshold(clf.predict(data_test))

    print("Training accuracy-")
    print(accuracy_score(labels_train,train_labels_pred))
    print("Testing accuracy-")
    print(accuracy_score(labels_test,test_labels_pred))
    print("Confusion matrix -")
    print(confusion_matrix(labels_test,test_labels_pred))
    print('true negatives is C[0,0] , false negatives is C[1,0] , true positives is C[1,1]  and false positives is C[0,1]')
    print("Precision and recall - ")
    print(precision_score(labels_test,test_labels_pred))
    print(recall_score(labels_test,test_labels_pred))

    return clf




def check_region(x):
    """Map a participant id string to a region code using the global ``csv`` frame.

    Returns 0 when ``x`` matches a ParticipantID whose Country is 'China',
    1 when it matches a ParticipantID whose Country is neither 'China' nor
    '-', and 2 otherwise. Ids are compared after ``str()`` conversion, and the
    China check takes precedence.
    """
    country = csv['Country']

    def _ids_where(mask):
        # String form of every ParticipantID on the rows selected by mask.
        return [str(pid) for pid in np.array(csv.loc[mask, 'ParticipantID'])]

    if x in _ids_where(country == 'China'):
        return 0
    if x in _ids_where((country != 'China') & (country != '-')):
        return 1
    return 2
    
def get_stats_csv():
    """Read the standings statistics of the five hard-coded contests into one frame.

    Each ``../Data/<contest>/standings_statistics.csv`` file is read as
    tab-separated text, and the frames are concatenated with ``sort=True``.
    """
    contests = ('Contest1236','Contest1243','Contest1244','Contest1245','Contest1248')
    frames = [
        pd.read_csv('../Data/'+str(contest)+'/standings_statistics.csv',sep='\t')
        for contest in contests
    ]
    return pd.concat(frames,sort=True)
    
    
    
    
    
def check_rating(x):
    """Map a participant id string to a rating code using the global ``csv`` frame.

    Returns 0 when ``x`` matches a ParticipantID whose Color is 'blue',
    1 when it matches one whose Color is 'violet', and 2 for everything
    else (including 'cyan'). Ids are compared after ``str()`` conversion, and
    the blue check takes precedence.
    """
    def _ids_with_color(color):
        # String form of every ParticipantID recorded with this colour.
        matches = csv.loc[csv['Color'] == color, 'ParticipantID']
        return {str(pid) for pid in np.array(matches)}

    # The cyan list built by the earlier version never influenced the result
    # (cyan falls into the catch-all 2), so it is no longer computed.
    if x in _ids_with_color('blue'):
        return 0
    if x in _ids_with_color('violet'):
        return 1
    return 2

def prepare_data(g_truth,stopwords_flag):
    """Build the standardised token-count feature frame and its label vector.

    Parameters
    ----------
    g_truth : {'rating', 'country'}
        Which ground truth to attach. 'rating' labels rows with
        ``check_rating`` (column 'rating'); 'country' labels rows with
        ``check_region`` (column 'region').
    stopwords_flag : bool
        When true, the hard-coded stop-word token columns are dropped.

    Returns
    -------
    tuple
        ``(x, y, col_red)``: ``x`` is the frame without its last (label)
        column, ``y`` is the label column as a 1-D array, and ``col_red`` is
        the column index of the frame including the label column.

    Raises
    ------
    ValueError
        If ``g_truth`` is not one of the supported values (previously the
        function fell through and returned None).

    Notes
    -----
    Rows labelled 2 by the lookup function are removed. Both lookup functions
    read the module-level ``csv`` frame, so it must be loaded beforehand.
    """
    if g_truth not in ('rating', 'country'):
        raise ValueError("g_truth must be 'rating' or 'country', got %r" % (g_truth,))

    cols=['i', '=', 'int', '#', '1', '0', 'n', '.', 'define', '<', 'a', 'x', '<<', '+', 'if', '-', 'll', '()', '++', '>', 'for', 'include', 'b', 'return', 'j', 'long', '>>', '==', '*', '2', 'v', '&', 'c', 'y', 'k', 'ans', 'cin', 'cout', '<=', '%', 'const', 'm', 'T', 'p', 's', 'using', 't', 'std', 'vector', 'else', 'u', 'typedef', 'd', '+=', 'namespace', 'r', 'while', 'N', 'main', 'l', '3', 'f', 'endl', 'void', '/', '::', 'push_back', 'first', 'second', 'size', '<bits/stdc++.h>', 'mod', 'dp', '--', ':', '!=', 'res', '&&', 'pair', 'scanf', 'printf', 'template', 'e', 'cnt', 'w', 'pb', 'tie', 'q', 'ch', 'typename', 'g', 'bool', 'it', 'auto', '>=', 'false', 'sum', 'LL', 'max', 'inline', '!', 'sync_with_stdio', 'double', 'maxn', 'class', 'char', 'long long', 'read', 'MOD', 'begin', 'A', '10', 'break;', 'end', 'rep', '" "', '"%d"', "'\\n'", '...', 'mp', 'pii', 'dfs', '?', 'tmp', 'min', 'getchar', "'0'", 'st', 'sort', 'continue;', 'string', '||', 'cost', '7', '1e9', 'make_pair', 'z', 'solve', 'operator', '"\\n"', '-=', 'all', 'pos', 'ostream', 'os', 'S', 'sz', '<iostream>', 'abs', 'M', '<algorithm>', 'ret', 'ios_base', 'MAXN', 'now', 'arr', 'col', 'ld', 'val', 'true', 'cur', '4', 'find', 'to', 'INF', 'num', "' '", 'vis', 'temp', 'fi', 'vi', 'endif', '5', 'mn', 'se', 'edge', 'out', '1000000007', 'insert', 'gcd', 'F', '<vector>', 'ios', 'B', 'freopen', 'ss', 'R', 'fa', 'par', '<cstdio>', 'P', '%=', 'L', 'id', 'pll', 'root', '1e5', 'inf', '/=', 'pragma', 'mid', "'1'", 'mx', 'C', 'set', 'tot', 'node', "'9'", 'X', 'h', 'adj', '<queue>', 'c2', "'-'", '<map>', 'swap', '<set>', 'c1', 'in', '<cmath>', 'struct', '"%lld\\n"', 'GCC', 'ff', '6', 'signed', 'edges', '100005', '<cstring>', 'stdin', 'G', 'p1', 'FOR', 'vec', '"%lld"', 'return;', 'NULL', 'REP', 'cerr', 'ans1', '1ll', 'o', 'ifdef', 'V', 'K', 'push', 'str', 'erase', '->', '<string>', 'pr', 'U', 'ull', 'color', 'unsigned', '"r"', 'flag', 'ii', '"%d\\n"', 'count', '*=', 'top', 'xx', 'optimize', 'pre', 'get', 'pi', '1e18', 
'ans2', 'E', 'puts', 'dist', 'Y', 'nxt', 'fr', 'T1', 'p2', 'len', 'map', 'empty', '[]', 'x1', '1LL', 'lli', 'parent', 'dis', 'add', 'sc', 's1', '<stack>', 'T2', 'print', 'resize', 'stdout', 'used', 'lower_bound', 'head', 'idx', 'PI', 'long double', 'sizeof', 'vll', 'deg', '^', '"-1"', "'('", 're', '", "', 'isdigit', 'debug', 'curr', 'ifndef', 'tree', 'memset', 't1', 'mi', 'a1', 'pair<int,int>', 's2', 'args', '>>=', 'istream', 'dir', 'ind', 'to_string', 'left', '"%d%d"', 'ONLINE_JUDGE', 'putchar', 'eps', '"w"', 'fib', '"-1\\n"', 'power', 'pop', 'right', 'yy', 'upper_bound', 'sum1', 'back', 'ar', 'MAX', 'register', 'names', '<iomanip>', '"%d %d\\n"', 'db', 't2', 'sum2', 'Edge', '""', '<bitset>', 'ok', 'emplace_back', 'pq', '__gcd', 'check', 'start', 'cnt1', 'rt', 'vl', '48', 'x2', 'graph', 'cnt2', 'y1', '0x3f3f3f3f', 'a2', '"%d "', 'tt', 'row', 'cmp', '"input.txt"', 'Q', 'md', '{}', 'int32_t', 'init', '__gnu_pbds', 'D', '2005', 'Args', 'from', 'write', 'base', 'last', 'Arg1', 'pp', 'acos', '<unordered_map>', 'ed', '<ext/pb_ds/assoc_container.hpp>', 'next', 'nullptr', '1e6', 'prime', 'front', 'long long int', 'forn', '100010']
    # Token columns removed when stopwords_flag is set (single shared copy
    # instead of one per ground-truth branch).
    stopwords=['ans', 'm', 'T', 'p', 's', 't', 'u', 'd', 'r', 'N', 'l', '3', 'f', 'dp', 'res', 'e', 'cnt', 'w', 'q', 'ch', 'g', 'it', 'A', '10', 'tmp', 'st', '7', '1e9', 'z', 'all', 'S', 'sz', 'M', 'MAXN', 'now', 'arr', 'col', 'val', 'cur', '4', 'num', 'vis', 'temp', '5', 'mn', 'se', 'F', 'B', 'ss', 'R', 'fa', 'par', 'P', 'L', 'id', 'root', '1e5', 'inf', 'mid', "'1'", 'mx', 'C', 'tot', 'node', "'9'", 'X', 'h', 'adj', 'c2', 'c1', '6', 'edges', '100005', 'G', 'p1', 'vec', 'ans1', 'o', 'V', 'K', 'pr', 'color', 'flag', 'count', 'top', 'xx', 'pre', 'ans2', 'E', 'dist', 'Y', 'T1', 'p2', 'len', 'x1', 'parent', 'sc', 's1', 'T2', 'head', 'idx', 'PI', 'deg', '^', '"-1"', "'('", '", "', 'curr', 'tree', 't1', 'mi', 'a1', 's2', 'dir', 'ind', 'left', 'eps', '"w"', 'fib', '"-1\\n"', 'right', 'yy', 'sum1', 'ar', 'names', 'db', 't2', 'sum2', 'Edge', '""', 'ok', 'check', 'start', 'cnt1', 'rt', 'vl', '48', 'x2', 'graph', 'cnt2', 'y1', '0x3f3f3f3f', 'a2', 'tt', 'row', 'cmp', '"input.txt"', 'Q', 'md', 'D', '2005', 'Args', '1e6', 'forn', '100010', 'k', 'c', 'x', 'i']

    # The two ground truths differ only in label column, lookup and messages.
    if g_truth=='rating':
        label_col, lookup = 'rating', check_rating
        start_msg, done_msg = "looking up rating....", "rating looked-up successfully"
    else:
        label_col, lookup = 'region', check_region
        start_msg, done_msg = "looking up region...........", "region looked up"

    contest_list = ['Contest1236','Contest1243','Contest1244','Contest1245','Contest1248']
    token_data = pd.concat([
        pd.read_csv('../Data/'+str(contest)+'/tokens_stats.txt',sep=' ',header=None,skiprows=1,names=cols)
        for contest in contest_list
    ])
    # Column-wise z-score over the stacked contests.
    token_data = (token_data - token_data.mean())/token_data.std()
    print("core token data generated")
    print(token_data.shape)

    # The participant id is the part of each index entry before the first '_'.
    # presumably the index holds source-file names - verify against the data files
    pid = [entry.split('_')[0] for entry in token_data.index]
    labelled = token_data.assign(pid=pid)
    print("participant_id added")
    print(start_msg)
    labelled[label_col] = labelled['pid'].apply(lookup)
    print(done_msg)

    if stopwords_flag:
        labelled = labelled.drop(columns=stopwords)
        print("stop words removed")

    # Non-inplace operations avoid mutating a filtered slice of the frame.
    labelled = labelled[labelled[label_col] != 2].drop(columns='pid')
    col_red = labelled.columns
    x = labelled.iloc[:, :-1]
    y = np.array(labelled.iloc[:, -1])
    return x,y,col_red
       
        


## Ground Truth = Rating

In [214]:
# `csv` must exist at module level: check_rating / check_region read it as a global.
csv = get_stats_csv()
x_rating, y_rating, token_col = prepare_data(g_truth='rating', stopwords_flag=False)

core token data generated
(17866, 400)
participant_id added
looking up rating....
rating looked-up successfully


In [230]:
# Sequential 60/40 split for the two-class rating task (first 60% of rows train).
(x_train_rating, y_train_rating,
 x_test_rating, y_test_rating,
 class_weight) = train_test_split(data=x_rating, labels=y_rating, n_class=2, tr_cut=0.6)

#### random forest

In [274]:
# Random forest on the rating split; diagnostics are printed by rf_model.
rf = rf_model(
    data_train=x_train_rating,
    labels_train=y_train_rating,
    data_test=x_test_rating,
    labels_test=y_test_rating,
    class_weight=class_weight,
)

Training accuracy-
1.0
Testing accuracy-
0.7777978993118435
Confusion matrix -
[[4259   30]
 [1197   36]]
true negatives is C[0,0] , false negatives is C[1,0] , true positives is C[1,1]  and false positives is C[0,1]
Precision and recall - 
0.5454545454545454
0.029197080291970802


#### logistic regression

In [275]:
# Logistic regression on the rating split (binary, class-weighted).
lg_rating = log_reg_model(
    data_train=x_train_rating,
    labels_train=y_train_rating,
    data_test=x_test_rating,
    labels_test=y_test_rating,
    n_class=2,
    cw_dict=class_weight,
)

Training accuracy-
0.7386528247223564
Testing accuracy-
0.6218761318362912
Confusion matrix -
[[3053 1236]
 [ 852  381]]
true negatives is C[0,0] , false negatives is C[1,0] , true positives is C[1,1]  and false positives is C[0,1]
Precision and recall - 
0.23562152133580705
0.30900243309002434


#### light gbm

In [276]:
# LightGBM on the rating split; the class_weight argument is accepted but not used by light_gbm_model.
lgbm_rating = light_gbm_model(
    data_train=x_train_rating,
    labels_train=y_train_rating,
    data_test=x_test_rating,
    labels_test=y_test_rating,
    class_weight=class_weight,
)

Training accuracy-
0.8891839690970545
Testing accuracy-
0.7142339731981167
Confusion matrix -
[[3720  569]
 [1009  224]]
true negatives is C[0,0] , false negatives is C[1,0] , true positives is C[1,1]  and false positives is C[0,1]
Precision and recall - 
0.28247162673392184
0.1816707218167072


## Ground Truth = Country

In [278]:
# Same feature pipeline with the region label; relies on the module-level `csv` loaded earlier.
x_country, y_country, token_col = prepare_data(g_truth='country', stopwords_flag=False)

core token data generated
(17866, 400)
participant_id added
looking up region...........
region looked up


In [279]:
# Column index returned by prepare_data: the token columns followed by the label column.
token_col

Index(['i', '=', 'int', '#', '1', '0', 'n', '.', 'define', '<',
       ...
       '<ext/pb_ds/assoc_container.hpp>', 'next', 'nullptr', '1e6', 'prime',
       'front', 'long long int', 'forn', '100010', 'region'],
      dtype='object', length=401)

In [280]:
# Sequential 60/40 split for the two-class country task (first 60% of rows train).
(x_train_country, y_train_country,
 x_test_country, y_test_country,
 class_weight_c) = train_test_split(data=x_country, labels=y_country, n_class=2, tr_cut=0.6)

#### random forest

In [281]:
# Random forest on the country split; diagnostics are printed by rf_model.
rf_country = rf_model(
    data_train=x_train_country,
    labels_train=y_train_country,
    data_test=x_test_country,
    labels_test=y_test_country,
    class_weight=class_weight_c,
)

Training accuracy-
1.0
Testing accuracy-
0.8879382303839732
Confusion matrix -
[[1122  427]
 [ 110 3133]]
true negatives is C[0,0] , false negatives is C[1,0] , true positives is C[1,1]  and false positives is C[0,1]
Precision and recall - 
0.880056179775281
0.9660807893925377


In [282]:
# Pair each token column with its random-forest importance; the trailing
# label column of token_col has no importance entry and is skipped.
coeffecient_importance_country = rf_country.feature_importances_
f_imp_score_country = {
    token: coeffecient_importance_country[position]
    for position, token in enumerate(token_col[:-1])
}

In [283]:
import operator
# Most important tokens first. Despite the "_rating" name, these are the
# country model's importances (taken from f_imp_score_country).
sorted_f_imp_score_rating = sorted(
    f_imp_score_country.items(),
    key=lambda token_score: token_score[1],
    reverse=True,
)
sorted_f_imp_score_rating

[('k', 0.16099939627430163),
 ('long', 0.048093828691405986),
 ('.', 0.04258108545784607),
 ('c', 0.03630945226205444),
 ('-=', 0.034505163624088335),
 ('...', 0.02369108068659787),
 ('db', 0.022940893824566877),
 ('a', 0.013978165351564615),
 ('w', 0.013557520615786564),
 ('dp', 0.01268055918280591),
 ('j', 0.012458208625640045),
 ('else', 0.011289750360786756),
 ('g', 0.011140672911417715),
 ('<', 0.010579560597724974),
 ('int', 0.010182226420746808),
 ('const', 0.010056018304597918),
 ('read', 0.010010113246319348),
 ('<queue>', 0.009959371765016793),
 ('mod', 0.009375919558804038),
 ('template', 0.009070603569474202),
 ('INF', 0.008349947695541418),
 ('::', 0.008325913466250872),
 ('||', 0.008267725936044037),
 ('p', 0.007958964524650482),
 ('=', 0.007844552357684305),
 ('cost', 0.007766940169720817),
 ('()', 0.007743280338849684),
 ('r', 0.007607149456725876),
 ('1', 0.0072209921212926164),
 ('0', 0.006907284175746308),
 ('col', 0.0066157891461730515),
 ('auto', 0.0065929873145001

#### logistic regression

In [284]:
# Logistic regression on the country split (binary, class-weighted).
log_ref_country = log_reg_model(
    data_train=x_train_country,
    labels_train=y_train_country,
    data_test=x_test_country,
    labels_test=y_test_country,
    n_class=2,
    cw_dict=class_weight_c,
)

Training accuracy-
0.929465776293823
Testing accuracy-
0.8524624373956594
Confusion matrix -
[[1294  255]
 [ 452 2791]]
true negatives is C[0,0] , false negatives is C[1,0] , true positives is C[1,1]  and false positives is C[0,1]
Precision and recall - 
0.9162836506894287
0.8606228800493371


#### light gbm

In [285]:
# LightGBM on the country split. Pass the country class weights
# (class_weight_c); the rating task's `class_weight` was passed here before.
# light_gbm_model does not currently use this argument, so results are unchanged.
lgbm_country = light_gbm_model(x_train_country,y_train_country,x_test_country,y_test_country,class_weight_c)

Training accuracy-
0.9293266555370061
Testing accuracy-
0.8971202003338898
Confusion matrix -
[[1399  150]
 [ 343 2900]]
true negatives is C[0,0] , false negatives is C[1,0] , true positives is C[1,1]  and false positives is C[0,1]
Precision and recall - 
0.9508196721311475
0.8942337341967315
