In [1]:
import re
import time
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
import csv
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=sklearn.exceptions.UndefinedMetricWarning)

In [6]:
# train on Pass1's reworked GTD
df_train = pd.read_csv('data/GTxM_Pass1/reGTr_Tokens.csv', encoding='ISO-8859-1')
# test (predict) on Pass2's CGT labeled data with semantic score >= 0.7
df_test = pd.read_csv('data/GTxM_Pass2/CGT_Labeled_Tokens_0_70.csv', encoding='ISO-8859-1')
#data = pd.concat([d1,d2], axis=0)

In [7]:
len(df_train), len(df_test)

(854, 407)

In [4]:
df_train.head(2)

Unnamed: 0.1,Unnamed: 0,RecID,Target,Label,rowid,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,0,1089699762331217920,1,Business,192,Peloton exercise bike ad mocked as being 'sexi...,Love putting my Peloton bike in the most strik...,118.0,hilarious pton homesteadexemption realproblems...,profgalloway dpshow dirrtydut danglebus nicksc...,peloton serious later coupl monthli minut king...,love peloton bike area hous wife glanc hubbi e...,keep trap doe pedal whogco buy broke buy play ...,matter away serious later fast btw though outd...,strike ultra modern nervou dark right perfect ...,Love putting my Peloton bike in the most strik...,Love putting my Peloton bike in the most strik...
1,1,1139309394968096768,6,Politics,291,Six Takeaways From Senators' Questions to Impe...,I would not have thought that I needed to say ...,25.0,clintonfoundation corruptcomplicitgop clintons...,ellenlweintraub k9dancerpovey nypapajoe killer...,agenda us elect statu quo unsustain american b...,intern corpor polit agenda us elect polit dona...,control influenc destroy wipe cheat breath myb...,care right behind forward total appar,vast unsustain unaccept question possibl fair ...,I would not have thought that I needed to say ...,I would not have thought that I needed to say ...


In [5]:
df_test.head(2)

Unnamed: 0.1,Unnamed: 0,RecID,Label,Target,rowid,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,0,1223365339494453248,Politics,6,4682,"At the Stroke of Brexit, Britain Steps, Guarde...",Tonight we are leaving the European Union. htt...,260.0,leavers happynewstart democraticcountry brexit...,conservatives 10downingstreet geertwilderspvv ...,tonight european union fa auo auo au auu au au...,tonight european union fc fc fc fc fc fc fc fc...,leav sold lie pcepuepu pufpu paapa pa pe paepu...,fcofaea serious though otherwis fals forward f...,british fyefyefyefyefyefyefyefyefyefyefa auo a...,Tonight we are leaving the European Union. htt...,now and always fAE!fA+fc!fc|fc!fc|fc!fc|fc!fc|...
1,1,1222952347548164098,Politics,6,4662,"U.K. Leaves E.U., Embarking On an Uncertain Fu...",I am sad to see our British friends leave the ...,260.0,bernardtapie loveit 0doubt italy france brexit...,fathonie_ag_top devonlass nevenmaguire _hadley...,british eu brexit european us uk gcpsbasket br...,friend eu mandat brexit disrupt citizen employ...,leav act ensur unit understand mean trust unde...,perhap hope anytim qu un fulli therebi vigor f...,sad british littl possibl financi western brit...,I am sad to see our British friends leave the ...,I am sad to see our British friends leave the ...


In [6]:
#df_train[['Label','Target']].groupby(['Label','Target']).size()
df_train.groupby(['Label']).size()

Label
Business           73
Entertainment     143
Environmental       7
Human Rights       53
Law and Order       1
Obituary          100
Politics          380
Social Stories     24
Sports             73
dtype: int64

In [7]:
# Remove Environmental, Health, Social Stories and 'Law and Order' since the count of SMRs are less than 50
df_train = df_train[df_train.Label != 'Environmental']
df_train = df_train[df_train.Label != 'Health']
df_train = df_train[df_train.Label != 'Law and Order']
df_train = df_train[df_train.Label != 'Social Stories']

In [8]:
len(df_train)

822

In [9]:
df_train.groupby(['Label']).size()

Label
Business          73
Entertainment    143
Human Rights      53
Obituary         100
Politics         380
Sports            73
dtype: int64

In [10]:
# Code the Targets 0-5
df_train.loc[(df_train.Target == 1), 'Target'] = 0 # Business
df_train.loc[(df_train.Target == 2), 'Target'] = 1 # Entertainment
df_train.loc[(df_train.Target == 5), 'Target'] = 2 # Human Rights
df_train.loc[(df_train.Target == 6), 'Target'] = 3 # Politics
df_train.loc[(df_train.Target == 9), 'Target'] = 4 # Obituary
df_train.loc[(df_train.Target == 11), 'Target'] = 5 # Sports

In [11]:
df_test.groupby(['Label']).size()

Label
Business            4
Entertainment      22
Environmental      40
Health              4
Human Rights       40
Law and Order      37
Obituary           64
Politics          184
Social Stories      9
Sports              3
dtype: int64

In [12]:
# Remove Environmental, Health, Social Stories and 'Law and Order' since the count of TRAINING SMRs are less than 50
df_test = df_test[df_test.Label != 'Environmental']
df_test = df_test[df_test.Label != 'Health']
df_test = df_test[df_test.Label != 'Law and Order']
df_test = df_test[df_test.Label != 'Social Stories']

In [13]:
len(df_test)

317

In [14]:
df_test.groupby('Label').size()

Label
Business           4
Entertainment     22
Human Rights      40
Obituary          64
Politics         184
Sports             3
dtype: int64

In [15]:
# Code the Targets 0-5
df_test.loc[(df_test.Target == 1), 'Target'] = 0 # Business
df_test.loc[(df_test.Target == 2), 'Target'] = 1 # Entertainment
df_test.loc[(df_test.Target == 5), 'Target'] = 2 # Human Rights
df_test.loc[(df_test.Target == 6), 'Target'] = 3 # Politics
df_test.loc[(df_test.Target == 9), 'Target'] = 4 # Obituary
df_test.loc[(df_test.Target == 11), 'Target'] = 5 # Sports

In [16]:
df_test

Unnamed: 0.1,Unnamed: 0,RecID,Label,Target,rowid,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,0,1223365339494453248,Politics,3,4682,"At the Stroke of Brexit, Britain Steps, Guarde...",Tonight we are leaving the European Union. htt...,260.0,leavers happynewstart democraticcountry brexit...,conservatives 10downingstreet geertwilderspvv ...,tonight european union fa auo auo au auu au au...,tonight european union fc fc fc fc fc fc fc fc...,leav sold lie pcepuepu pufpu paapa pa pe paepu...,fcofaea serious though otherwis fals forward f...,british fyefyefyefyefyefyefyefyefyefyefa auo a...,Tonight we are leaving the European Union. htt...,now and always fAE!fA+fc!fc|fc!fc|fc!fc|fc!fc|...
1,1,1222952347548164098,Politics,3,4662,"U.K. Leaves E.U., Embarking On an Uncertain Fu...",I am sad to see our British friends leave the ...,260.0,bernardtapie loveit 0doubt italy france brexit...,fathonie_ag_top devonlass nevenmaguire _hadley...,british eu brexit european us uk gcpsbasket br...,friend eu mandat brexit disrupt citizen employ...,leav act ensur unit understand mean trust unde...,perhap hope anytim qu un fulli therebi vigor f...,sad british littl possibl financi western brit...,I am sad to see our British friends leave the ...,I am sad to see our British friends leave the ...
2,2,1222288749813518339,Politics,3,4598,Day 7 of Trump's Trial: The Defense Rests,Big WSJ scoop: McConnell tells R senators he *...,257.0,qanon trumpiseffed documentsandwitnesses mosco...,cherkalleck daveweigel alandersh contentedindi...,wsj mcconnel john bolton gci feinstein tuesday...,wsj scoop mcconnel senat vote wit john bolton ...,block left judg base judg finish reel block st...,absolut correctli possibl frankli separ togeth...,big presidentgco great antithet actual shock s...,Big WSJ scoop: McConnell tells R senators he *...,Big WSJ scoop: McConnell tells R senators he *...
3,3,1222281539100250114,Entertainment,1,4597,Kobe Bryant: Washington Post reporter reinstat...,New statement regarding Post reporter Felicia ...,260.0,himtoo whiteprivilege fakenewsmedia firefelici...,securemysocial washingtonpost fahrenthold jeff...,post felicia sonmez american washington post g...,statement post report felicia sonmez bullshit ...,regard refer employ defend cave hate lower act...,hell fyireal absolut daili probabl complet cer...,absolut unprofession warm dead sanctimoni weak...,New statement regarding Post reporter Felicia ...,New statement regarding Post reporter Felicia ...
4,4,1222242112307187712,Obituary,4,4588,Dinosaur Love Song: Why this 3-year-oldï¿½??s ...,"Fenn, my nearly 4 year old daughter, recorded ...",260.0,littlegirl kidsfirstsavingsfund todayneedshope...,hoarsewisperer doddleoddle sarabbrooks arlened...,nearli dinosaur love four couldngcot reconcil ...,fenn daughter solo song word bit tune dinosaur...,record help couldngcot reconcil issu knock cho...,nearli later instead later nicer absolut actua...,littl cute neoliber creativ final littl wonder...,"Fenn, my nearly 4 year old daughter, recorded ...","Fenn, my nearly 4 year old daughter, recorded ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,400,1221837195931193346,Politics,3,4536,"Surprise, Mr. President. John Bolton Has the G...",Sen. Mitt Romney responds to reports of John B...,10.0,,abcpolitics,mitt romney john bolton john bolton republican...,sen mitt romney report john bolton draft manus...,respond provid join hear ad admit taken believ...,increasingli earli,relev hard republican american corrupt jealou ...,Sen. Mitt Romney responds to reports of John B...,Sen. Mitt Romney responds to reports of John B...
401,401,1222335744687198209,Sports,5,4605,Kobe Bryant: Washington Post reporter reinstat...,I believe that Washington Post readers and emp...,260.0,freedomofspeech kobebraynt fail firefeliciason...,lizzylynngarcia feliciasonmez washingtonpost k...,washington post post tonight simpson robert bl...,washington post reader employe handl matter st...,believ includ deserv hear murder found base li...,directli perhap absolut tragic togeth long poo...,newspap common similar guilti common fae fae f...,I believe that Washington Post readers and emp...,My statement on The Post?s decision tonight: P...
404,404,1222872177604775937,Politics,3,4650,"L'ï¿½tat, C'est Trump",I was left stunned by the argument that Dersho...,84.0,trumpocalypse inourbestinterest dearthofwit un...,blogdiva gopsenate senrobportman alandersh dem...,dershowitz trump yesterday american mcconnel s...,argument dershowitz trump team yesterday kind ...,left heard doe reject reject count corrupt cor...,truli fulli absolut liter basic therefor long ...,stun legal authoritarian legitim american dead...,I was left stunned by the argument that Dersho...,I was left stunned by the argument that Dersho...
405,405,1222910064236744704,Politics,3,4658,"L'ï¿½tat, C'est Trump","Assuming the Senate votes against witnesses, a...",87.0,impeachandremovetrumpnow moreimpeaching obstru...,delavegalaw medit8now amandionair lanning_laur...,senat trump bolton mulvaney giuliani american ...,senat wit trump bolton mulvaney giuliani peopl...,assum vote expect subpoena entitl hear play ex...,immedi exactli sadli care long away right espe...,american obscen sham explicit democrat biparti...,"Assuming the Senate votes against witnesses, a...","Assuming the Senate votes against witnesses, a..."


In [17]:
# set Human Tights to Politics -- NOTE: did not improve the accuracy, so not used
# df_train.loc[(df_train.Target == '5'), 'Target'] = '6'
# df_train.loc[(df_train.Label == 'Human Rights'), 'Label'] = 'Politics'
# df_test.loc[(df_test.Target == '5'), 'Target'] = '6'
# df_test.loc[(df_test.Label == 'Human Rights'), 'Label'] = 'Politics'

In [18]:
scoring = {'acc': 'accuracy',
           'prec': 'precision_weighted',
           'recall': 'recall_weighted',
           'f1': 'f1_weighted'}

#test_ratio = 0.20
corpus = df_train['smrNouns'] + df_train['smrAdverbs']
# corpus = df_train['smrNouns'] + df_train['smrNER'] +df_train['smrAdverbs'] + df_test['smrAdjectives']
# corpus = df_train['smrNER'] + df_test['smrAdjectives']
corpus = corpus.fillna(value='')
vec = 'TFIDF'
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=5000)
data_vec = vectorizer.fit_transform(corpus)
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_train['RecID']
y = df_train['Target']
y.index = df_train['RecID']
y=y.astype('int')
X = vec_dtm


In [19]:
X.head()

Unnamed: 0_level_0,ab,abbott,abc,abil,abomin,abort,abov,abroad,absenc,absolut,...,young,yourselv,youth,youtub,ypg,zelenski,zero,zone,zuck,zuckerberg
RecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1089699762331217920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139309394968096768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1159148971106942981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1166443046361153537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1175764155359465478,0.300329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
y.head()

RecID
1089699762331217920    0
1139309394968096768    3
1159148971106942981    5
1166443046361153537    0
1175764155359465478    5
Name: Target, dtype: int32

In [21]:
clf = SVC(kernel='linear')
clf.fit(X, y)

SVC(kernel='linear')

In [22]:
# setup test data
corpus_test = df_test['smrNouns'] + df_test['smrAdverbs']
# corpus_test = df_test['smrNouns'] + df_test['smrNER'] + df_test['smrAdverbs'] + df_test['smrAdjectives']
# corpus_test = df_test['smrNER'] + df_test['smrAdjectives']
corpus_test = corpus_test.fillna(value='')
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=5000)
data_vec = vectorizer.fit_transform(corpus_test)
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_test['RecID']
y_test = df_test['Target']
y_test.index = df_test['RecID']
y_test=y_test.astype('int')
X_test = vec_dtm

In [23]:
df_test['Target']

RecID
1223365339494453248    3
1222952347548164098    3
1222288749813518339    3
1222281539100250114    1
1222242112307187712    4
                      ..
1221837195931193346    3
1222335744687198209    5
1222872177604775937    3
1222910064236744704    3
833502973204459520     1
Name: Target, Length: 317, dtype: int64

In [24]:
X_test

Unnamed: 0_level_0,aap,aatish,ab,abandon,abc,abd,abdullah,abil,abomin,abort,...,yourselv,youth,youtub,ypg,zarif,zelenski,zero,zone,zuck,zuckerberg
RecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1223365339494453248,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.005821,0.003094,0.0,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1222952347548164098,0.0,0.0,0.000000,0.0,0.0,0.0,0.019416,0.000000,0.000000,0.0,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.013045,0.0,0.0
1222288749813518339,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1222281539100250114,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.011642,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1222242112307187712,0.0,0.0,0.006776,0.0,0.0,0.0,0.000000,0.004862,0.000000,0.0,...,0.000000,0.006026,0.00818,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1221837195931193346,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1222335744687198209,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.007157,0.000000,0.0,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1222872177604775937,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1222910064236744704,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [25]:
test_pred = clf.predict(X_test)

In [26]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod only:
result = ['GTxM Pass 2', 'SVM', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [27]:
result

['GTxM Pass 2',
 'SVM',
 0,
 57.413249211356465,
 38.3153250534954,
 57.413249211356465,
 45.95454794859168]

In [28]:
df_test_pred = pd.DataFrame(test_pred, columns=['SVMPred']).set_index(y_test.index)

In [29]:
df_test_pred

Unnamed: 0_level_0,SVMPred
RecID,Unnamed: 1_level_1
1223365339494453248,3
1222952347548164098,3
1222288749813518339,3
1222281539100250114,3
1222242112307187712,3
...,...
1221837195931193346,3
1222335744687198209,3
1222872177604775937,3
1222910064236744704,3


In [38]:
df_pred = pd.concat([y_test.to_frame(),df_test_pred['SVMPred']], axis=1)

In [39]:
df_pred

Unnamed: 0_level_0,Target,SVMPred
RecID,Unnamed: 1_level_1,Unnamed: 2_level_1
1223365339494453248,3,3
1222952347548164098,3,3
1222288749813518339,3,3
1222281539100250114,1,3
1222242112307187712,4,3
...,...,...
1221837195931193346,3,3
1222335744687198209,5,3
1222872177604775937,3,3
1222910064236744704,3,3


### Generate Confusion Matrix

In [33]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass2/SVM_Confusion.csv')

In [34]:
#NOTE: The predictions for BERT and XLNet need to collected from GDrive, 
#placed in results\GTxM_Pass1 for continuing the code below.

### Write the predictions

In [40]:
# Reverse the target codes
df_pred.loc[(df_pred.SVMPred == 5), 'SVMPred'] = 11 # Sports
df_pred.loc[(df_pred.SVMPred == 4), 'SVMPred'] = 9 # Obituary
df_pred.loc[(df_pred.SVMPred == 3), 'SVMPred'] = 6 # Politics
df_pred.loc[(df_pred.SVMPred == 2), 'SVMPred'] = 5 # Human Rights
df_pred.loc[(df_pred.SVMPred == 1), 'SVMPred'] = 2 # Entertainment
df_pred.loc[(df_pred.SVMPred == 0), 'SVMPred'] = 1 # Busines

df_pred.loc[(df_pred.Target == 5), 'Target'] = 11 # Sports
df_pred.loc[(df_pred.Target == 4), 'Target'] = 9 # Obituary
df_pred.loc[(df_pred.Target == 3), 'Target'] = 6 # Politics
df_pred.loc[(df_pred.Target == 2), 'Target'] = 5 # Human Rights
df_pred.loc[(df_pred.Target == 1), 'Target'] = 2 # Entertainment
df_pred.loc[(df_pred.Target == 0), 'Target'] = 1 # Busines

In [46]:
# convert the index named RecID to proper column
df_pred.reset_index(inplace=True)

In [78]:
df_pred

Unnamed: 0,RecID,SVMPred
0,1223365339494453248,6
1,1222952347548164098,6
2,1222288749813518339,6
3,1222281539100250114,6
4,1222242112307187712,6
...,...,...
312,1221837195931193346,6
313,1222335744687198209,6
314,1222872177604775937,6
315,1222910064236744704,6


In [51]:
df_pred.drop(['Target'], axis=1, inplace=True)

In [79]:
df_intercoder = pd.read_csv('results/GTxM_Pass2/GTxM_Intercoder_Pred.csv')

In [80]:
df_intercoder

Unnamed: 0,InReplyTo,Label,Target
0,1223365339494453248,Politics,6
1,1222952347548164098,Politics,6
2,1222288749813518339,Politics,6
3,1222281539100250114,Entertainment,2
4,1222242112307187712,Obituary,9
...,...,...,...
402,1223302445889150976,Environmental,3
403,1222860231262318593,Social Stories,10
404,1222872177604775937,Politics,6
405,1222910064236744704,Politics,6


In [None]:
df_intercoder_updated = pd.merge(df_intercoder, df_pred, on=['RecID'], how='left')

In [63]:
df_intercoder_updated

Unnamed: 0.1,Unnamed: 0,RecID,Label,Target,SVMPred
0,325,1223365339494453248,Politics,6,6.0
1,324,1222952347548164098,Politics,6,6.0
2,323,1222288749813518339,Politics,6,6.0
3,321,1222281539100250114,Entertainment,2,6.0
4,320,1222242112307187712,Obituary,9,6.0
...,...,...,...,...,...
402,157,1223302445889150976,Environmental,3,
403,158,1222860231262318593,Social Stories,10,
404,159,1222872177604775937,Politics,6,6.0
405,160,1222910064236744704,Politics,6,6.0


In [64]:
# NOTE: -1 is used in the intercoder code for missing values
df_intercoder_updated.SVMPred = df_intercoder_updated.SVMPred.fillna(-1).astype(int)

In [65]:
df_intercoder_updated

Unnamed: 0.1,Unnamed: 0,RecID,Label,Target,SVMPred
0,325,1223365339494453248,Politics,6,6
1,324,1222952347548164098,Politics,6,6
2,323,1222288749813518339,Politics,6,6
3,321,1222281539100250114,Entertainment,2,6
4,320,1222242112307187712,Obituary,9,6
...,...,...,...,...,...
402,157,1223302445889150976,Environmental,3,-1
403,158,1222860231262318593,Social Stories,10,-1
404,159,1222872177604775937,Politics,6,6
405,160,1222910064236744704,Politics,6,6


In [75]:
df_intercoder_updated.to_csv('results/GTxM_Pass2/GTxM_Intercoder_Pred.csv', index=False)

#### DO NOT USE THE CODE BELOW

In [None]:
# KEPT ONLY FOR POSSIBLE FUTURE USE/REFERENCE

In [28]:
# For Prod:
X_train = pd.merge(pd.read_csv('data/GTxM_Pass1/X_train_RecID.csv').set_index('RecID'),X, left_index=True, right_index=True)
X_test = pd.merge(pd.read_csv('data/GTxM_Pass1/X_test_RecID.csv').set_index('RecID'),X, left_index=True, right_index=True)
y_train = pd.merge(pd.read_csv('data/GTxM_Pass1/y_train_Target.csv').set_index('RecID'),y, left_index=True, right_index=True)
y_test = pd.merge(pd.read_csv('data/GTxM_Pass1/y_test_Target.csv').set_index('RecID'),y, left_index=True, right_index=True)

In [36]:
# cleanup the duplicate columns from joins
X_train.drop(['RecID.1'], axis=1, inplace=True)
X_test.drop(['RecID.1'], axis=1, inplace=True)
y_train.drop(['Target_x'], axis=1, inplace=True)
y_train.rename(columns={'Target_y': 'Target'}, inplace=True)

In [None]:
.rename(columns={'RecID': 'delRecID'}, inplace=True)