# Project 1: Document author identification

In [1]:
# import warnings
# warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

## 1. Read the file and import data

In [3]:
import json

# read json file
def read_json(path):
    with open(path, 'r') as file:
         temp = json.load(file)
    df = pd.DataFrame(temp).T
    return df

In [4]:
train_df = read_json('comp90051-22-s1-p1/train.json')
train_df.head(5)

Unnamed: 0,venue,keywords,year,author
0,,"[64, 1, 322, 134, 136, 396, 270, 144, 476, 481...",2017,"[1605, 759]"
1,0.0,"[258, 260, 389, 261, 390, 396, 400, 17, 146, 2...",2013,[2182]
2,1.0,"[320, 454, 266, 462, 17, 339, 404, 342, 407, 2...",2007,[2176]
3,2.0,"[260, 132, 333, 15, 400, 272, 146, 401, 278, 3...",2013,[1107]
4,3.0,"[64, 385, 449, 450, 71, 73, 268, 80, 216, 25, ...",2009,[1414]


In [5]:
test_df = read_json('comp90051-22-s1-p1/test.json')
test_df.head(5)

Unnamed: 0,venue,keywords,year,coauthor,target
0,,"[260, 6, 390, 136, 7, 11, 17, 285, 288, 162, 4...",2017,[],988
1,94.0,"[260, 454, 137, 14, 400, 274, 339, 213, 280, 2...",2019,[1001],2123
2,31.0,"[390, 198, 7, 461, 462, 14, 404, 277, 24, 473,...",2014,[],1578
3,6.0,"[195, 6, 390, 10, 459, 464, 338, 146, 276, 466...",2010,[1347],2072
4,162.0,"[64, 1, 260, 457, 73, 147, 282, 27, 156, 43, 3...",2016,[1107],995


In [6]:
# print dimension for each dataframe
print(f"shape of train_df    = {train_df.shape}")
print(f"shape of test_df     = {test_df.shape}")

shape of train_df    = (26108, 4)
shape of test_df     = (2000, 5)


## 2. Create Label

In [7]:
def get_transformed_df(df):
    aus = df['author']
    transformed_lst = []
    for i in range(df.shape[0]):
        for j in aus[i]:
            temp = list(df.values[i][:-1])
            temp.append([au for au in aus[i] if au!=j])
            temp.append(j)
            transformed_lst.append(temp)
    cols = list(df.columns[:-1])
    cols.append('coauthor')
    cols.append('target')
    return pd.DataFrame(transformed_lst,columns=cols)

In [8]:
transformed_df = get_transformed_df(train_df)

In [9]:
transformed_df.head()

Unnamed: 0,venue,keywords,year,coauthor,target
0,,"[64, 1, 322, 134, 136, 396, 270, 144, 476, 481...",2017,[759],1605
1,,"[64, 1, 322, 134, 136, 396, 270, 144, 476, 481...",2017,[1605],759
2,0.0,"[258, 260, 389, 261, 390, 396, 400, 17, 146, 2...",2013,[],2182
3,1.0,"[320, 454, 266, 462, 17, 339, 404, 342, 407, 2...",2007,[],2176
4,2.0,"[260, 132, 333, 15, 400, 272, 146, 401, 278, 3...",2013,[],1107


## 3. One hot 

In [10]:
import torch
def get_onehot_col(col, size, col_prefix):
    transformed_col = []
    for i in col:
        transformed_col.append(torch.zeros(size, dtype=torch.int).scatter_(0, torch.tensor(i), value=1).tolist())
    col_names = [col_prefix+'_'+str(i) for i in range(size)]
    return pd.DataFrame(transformed_col, columns=col_names)

In [11]:
transformed_df.loc[transformed_df['venue']=='',['venue']]=470
venue_transformed = get_onehot_col(transformed_df['venue'], 471, "venue")

keywords_transformed = get_onehot_col(transformed_df['keywords'], 500, "keywords")

transformed_df['year'] = transformed_df['year']-2000
year_transformed = get_onehot_col(transformed_df['year'], 20, "year")

coauthor_transformed = get_onehot_col(transformed_df['coauthor'], 2302, "coauthor")

target_transformed = get_onehot_col(transformed_df['target'], 2302, "target")




In [12]:
year_transformed.head()

Unnamed: 0,year_0,year_1,year_2,year_3,year_4,year_5,year_6,year_7,year_8,year_9,year_10,year_11,year_12,year_13,year_14,year_15,year_16,year_17,year_18,year_19
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [13]:
transformed_df_comb = pd.concat([venue_transformed,keywords_transformed,year_transformed,coauthor_transformed],axis=1)

In [14]:
transformed_df_comb['target'] = transformed_df['target'].values

In [15]:
transformed_df_comb.head()

Unnamed: 0,venue_0,venue_1,venue_2,venue_3,venue_4,venue_5,venue_6,venue_7,venue_8,venue_9,...,coauthor_2293,coauthor_2294,coauthor_2295,coauthor_2296,coauthor_2297,coauthor_2298,coauthor_2299,coauthor_2300,coauthor_2301,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1605
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,759
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2182
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2176
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1107


## 2. Train test split

In [31]:
transformed_df_comb.shape

(48000, 3294)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_val,y_train,   y_val = train_test_split(transformed_df_comb.iloc[:,:-1], transformed_df_comb['target'],test_size=0.2, random_state=90051)

print("X_train shape: {}, y_train shape: {}".format(X_train.shape, y_train.shape))
print("X_val   shape: {}, y_val   shape: {}".format(X_val.shape, y_val.shape))



X_train shape: (38400, 3293), y_train shape: (38400,)
X_val   shape: (9600, 3293), y_val   shape: (9600,)


In [24]:
X_val_pos = X_val
X_val_neg = X_val
y_val_pos = y_val
y_val_neg = pd.Series(np.random.choice([i for i in range(2302)], 14400))
X_val_comb = pd.concat([X_val_pos, X_val_neg],axis = 0)
y_val_comb = y_val_pos.append(y_val_neg, ignore_index=True)

true_label = [1 for i in range(X_val_pos.shape[0])]+[0 for i in range(X_val_pos.shape[0])]
print("X_val_comb shape: ", X_val_comb.shape)
print("y_val_comb shape: ", y_val_comb.shape)

X_val_comb shape:  (19200, 3293)
y_val_comb shape:  (24000,)


## 3. Model: Multinomal NB

In [18]:
from sklearn.metrics import roc_auc_score

In [20]:
from sklearn.naive_bayes import MultinomialNB
clf_NB = MultinomialNB()
clf_NB.fit(X_train, y_train)

MultinomialNB()

In [None]:
from sklearn.metrics import roc_auc_score

pred_NB = clf_NB.predict_proba(X_val_comb)

In [None]:
res = []
for i in range(len(pred_NB)):
    res.append(pred_NB[i][y_val_comb[i]])

In [26]:
roc_auc_score(true_label, res)  #0.783803759765625

0.783803759765625

## 3. Model: LR

In [27]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

- C=1

In [28]:
%%time
clf_LR_1 = LogisticRegression(multi_class="multinomial", C=1, verbose=True)
clf_LR_1.fit(X_train, y_train)      

pred_LR_1 = clf_LR_1.predict_proba(X_val_comb)
res_1 = []
for i in range(len(pred_LR_1)):
    res_1.append(pred_LR_1[i][y_val_comb[i]])

roc_auc_score(true_label, res_1) # 0.9471915310329861

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      7582788     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.97275D+05    |proj g|=  1.36319D+02

At iterate   50    f=  6.66508D+04    |proj g|=  4.77983D+00

At iterate  100    f=  6.65236D+04    |proj g|=  6.42255D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****    100    108      1     0     0   6.423D-01   6.652D+04
  F =   66523.625209797581     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 27.5min finished


CPU times: user 1h 14min 39s, sys: 5min 4s, total: 1h 19min 44s
Wall time: 27min 40s


0.9471915310329861

- C=0.1

In [29]:
%%time
clf_LR_01 = LogisticRegression(multi_class="multinomial", C=0.1, verbose=True)
clf_LR_01.fit(X_train, y_train)      

pred_LR_01 = clf_LR_01.predict_proba(X_val_comb)
res_01 = []
for i in range(len(pred_LR_01)):
    res_01.append(pred_LR_01[i][y_val_comb[i]])

roc_auc_score(true_label, res_01)  #0.920612082248264

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      7582788     M =           10

At X0         0 variables are exactly at the bounds


 This problem is unconstrained.



At iterate    0    f=  2.97275D+05    |proj g|=  1.36319D+02

At iterate   50    f=  2.04440D+05    |proj g|=  2.44436D+00

At iterate  100    f=  2.04435D+05    |proj g|=  8.91618D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****    100    108      1     0     0   8.916D-02   2.044D+05
  F =   204434.91123744042     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 23.6min finished


CPU times: user 1h 4min 47s, sys: 4min 7s, total: 1h 8min 55s
Wall time: 23min 48s


0.920612082248264

## 4. Read on test data

In [32]:
test_df.head()

Unnamed: 0,venue,keywords,year,coauthor,target
0,,"[260, 6, 390, 136, 7, 11, 17, 285, 288, 162, 4...",2017,[],988
1,94.0,"[260, 454, 137, 14, 400, 274, 339, 213, 280, 2...",2019,[1001],2123
2,31.0,"[390, 198, 7, 461, 462, 14, 404, 277, 24, 473,...",2014,[],1578
3,6.0,"[195, 6, 390, 10, 459, 464, 338, 146, 276, 466...",2010,[1347],2072
4,162.0,"[64, 1, 260, 457, 73, 147, 282, 27, 156, 43, 3...",2016,[1107],995


In [33]:
test_df.loc[test_df['venue']=='',['venue']]=470
venue_transformed_test = get_onehot_col(test_df['venue'], 471, "venue")

keywords_transformed_test = get_onehot_col(test_df['keywords'], 500, "keywords")

test_df['year'] = test_df['year']-2000
year_transformed_test = get_onehot_col(test_df['year'], 20, "year")

coauthor_transformed_test = get_onehot_col(test_df['coauthor'], 2302, "coauthor")

target_transformed_test = get_onehot_col(test_df['target'], 2302, "target")





In [34]:
transformed_df_comb_test = pd.concat([venue_transformed_test,keywords_transformed_test,year_transformed_test,coauthor_transformed_test],axis=1)
target = test_df['target']


In [35]:
transformed_df_comb_test.head()

Unnamed: 0,venue_0,venue_1,venue_2,venue_3,venue_4,venue_5,venue_6,venue_7,venue_8,venue_9,...,coauthor_2292,coauthor_2293,coauthor_2294,coauthor_2295,coauthor_2296,coauthor_2297,coauthor_2298,coauthor_2299,coauthor_2300,coauthor_2301
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
transformed_df_comb_test.shape

(2000, 3293)

## 5. Fit on test data - MNB
- AUC on Kaggle: 0.70735

In [44]:
pred_MNB_test = clf_NB.predict_proba(transformed_df_comb_test)
res_test_MNB = []
for i in range(len(pred_MNB_test)):
    res_test_MNB.append(pred_MNB_test[i][target[i]])



In [45]:
sample_df = pd.read_csv('comp90051-22-s1-p1/sample.csv',index_col=0)
sample_df['Predicted'] = res_test_MNB
sample_df.to_csv('predicted_MNB_onehot.csv')


## 5. Fit on test data - LR
- AUC on Kaggle: 0.88141

In [40]:
pred_LR_test = clf_LR_1.predict_proba(transformed_df_comb_test)
res_test = []
for i in range(len(pred_LR_test)):
    res_test.append(pred_LR_test[i][target[i]])


In [41]:
sample_df = pd.read_csv('comp90051-22-s1-p1/sample.csv',index_col=0)
sample_df['Predicted'] = res_test
sample_df.to_csv('predicted_LR.csv')