In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import SVC

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this hides the warning instead of removing its cause; prefer taking an
# explicit .copy() of any DataFrame slice that is modified later.
pd.options.mode.chained_assignment = None

In [None]:
# Load both CSV splits, then report each frame's (rows, columns) shape on its own line.
train = pd.read_csv("./Data/train.csv")
test = pd.read_csv("./Data/test.csv")
print(train.shape, test.shape, sep="\n")

(2797, 986)
(614, 986)


In [None]:
# Keep the target column aside, then strip the first five columns from `train` in place
# so that what remains is passed to the scaler/models as the feature matrix.
# NOTE(review): presumably 'label' is one of those five leading columns; if it is not,
# it stays in `train` as a feature -- verify against the CSV header.
y = train['label']
cols = train.columns[:5]
train.drop(columns=cols, inplace=True)

In [None]:
# Learn the per-column scaling statistics from the training features, then apply them.
# `scaler` is kept so the identical transformation can be applied to the test set.
scaler = StandardScaler()
scaler.fit(train)
X = scaler.transform(train)

In [None]:
# Prepare the test set the same way as the training set: remember the ID column,
# drop the five leading columns, and scale with the scaler fitted on `train`.
cols = test.columns[0:5]
# Take an independent copy of the ID column. Later cells add a 'label' column to `id`;
# writing into a bare slice of `test` is what raises pandas' SettingWithCopyWarning.
# NOTE(review): `id` shadows the Python builtin; the name is kept because later cells use it.
id = test[['ID']].copy()
test.drop(columns=cols,inplace=True)
# transform only (no fit): reuse the statistics learned from the training features.
Xtest = scaler.transform(test)

### 1. Logistic regression

In [None]:
# Parameter tuning: try each C in the grid for logistic regression and keep the one
# with the highest mean 5-fold cross-validated F1.
bScore = 0
cVal = -1  # sentinel: remains -1 if no candidate scores strictly above 0
# Grid 0.1, 0.2, ..., 1.9 built from integers. np.arange with a float step accumulates
# rounding error (e.g. 0.30000000000000004), which then leaks into the reported best C.
pSpace = np.arange(1, 20) / 10
for c in pSpace:
  model = LogisticRegression(random_state=0,max_iter=1000,C=c)
  scores = cross_val_score(model, X, y, cv=5, scoring="f1")
  mScore = scores.mean()
  # Strict '>' keeps the first (smallest) C when several candidates tie.
  if mScore>bScore:
    bScore = mScore
    cVal = c
print(f"C : {cVal} Score : {bScore}")

C : 1 Score : 0.5965253838684129


In [None]:
# Refit logistic regression on the full training set with the C selected by the tuning
# cell, then predict labels for the test set.
# cVal is used instead of a hand-copied constant so this cell cannot drift from the search
# result. The SVM section reuses the name cVal, so run the cells in order.
model = LogisticRegression(random_state=0,max_iter=1000,C=cVal).fit(X,y)
id['label'] = model.predict(Xtest)
# Uncomment to write the predictions to disk.
#id.to_csv("./Results/Pred_LR.csv",index=False)

In [None]:
# Pair every feature name with its logistic-regression coefficient, order the pairs from
# the largest coefficient down (ties fall back to the column name), and show the first 20.
weights = model.coef_[0, :]
ranked = sorted(zip(weights, train.columns), reverse=True)
z = [(name, weight) for weight, name in ranked]
print("\n".join(str(pair) for pair in z[:20]))

('BOW 556', 1.5566062962000449)
('BOW 179', 1.0805307399045896)
('title content 21.1', 1.0556612896766293)
('BOW 436', 1.0479340628636307)
('similar similar 12.1', 1.0369063157652054)
('title content start 12.1', 1.035694173117478)
('BOW 653', 1.006027689098631)
('extracted extracted 12.11', 1.0009042440238103)
('similar similar 21.3', 0.9521374957244352)
('BOW 389', 0.9455135326189658)
('BOW 115', 0.8891165349915321)
('title 12', 0.8796219746959211)
('links links 21.2', 0.8774736076387655)
('BOW 431', 0.8770740037500787)
('title content 12.3', 0.858098003832005)
('BOW 403', 0.8130736490516094)
('similar title 12', 0.8016624686074214)
('BOW 516', 0.7930259680823858)
('similar title 21.10', 0.789600040648109)
('BOW 381', 0.738694390413435)


### 2. Random forests

In [None]:
# Parameter tuning: exhaustive search over (n_estimators, max_depth) for the random
# forest, scored by mean 5-fold cross-validated F1. The first best combination wins ties.
bScore = 0
p1Val = -1
p2Val = -1
p1Space = np.arange(100, 800, 100)
p2Space = np.arange(15, 20, 1)
# Same visiting order as a nested loop: n_estimators outer, max_depth inner.
for n_trees, depth in ((n, d) for n in p1Space for d in p2Space):
    candidate = RandomForestClassifier(n_estimators=n_trees, max_depth=depth, random_state=0)
    mean_f1 = cross_val_score(candidate, X, y, cv=5, scoring="f1").mean()
    if mean_f1 > bScore:
        bScore, p1Val, p2Val = mean_f1, n_trees, depth
print(f"N_estimators : {p1Val} Max_depth : {p2Val} Score : {bScore}")

N_estimators : 400 Max_depth : 17 Score : 0.6501174061784597


In [None]:
# Refit the random forest on the full training set with the (n_estimators, max_depth)
# pair selected by the tuning cell, then predict labels for the test set.
# The tuned variables replace hand-copied constants so the fit always matches the search;
# the ExtraTrees section reuses p1Val/p2Val, so run the cells in order.
model = RandomForestClassifier(n_estimators=p1Val, max_depth=p2Val, random_state=0).fit(X,y)
id['label'] = model.predict(Xtest)
# Uncomment to write the predictions to disk.
#id.to_csv("./Results/Pred_RF.csv",index=False)

In [None]:
# Rank the features by the fitted forest's feature_importances_, highest first
# (ties fall back to the column name), and print the top 20 as (name, importance) pairs.
importance_pairs = zip(model.feature_importances_, train.columns)
z = [(feature, score) for score, feature in sorted(importance_pairs, reverse=True)]
top_twenty = z[:20]
print(*top_twenty, sep="\n")

('title content 12.3', 0.020851131450087487)
('links links 12.1', 0.014240226471088873)
('links links 21.1', 0.013932188281713533)
('links links 21.5', 0.012304120816630912)
('title content start 12.3', 0.011451009132996624)
('content sim.1', 0.010937609451023862)
('content sim', 0.01089424555815265)
('title content 21.3', 0.009910955951348574)
('title content 12.4', 0.009605613345423881)
('links links 12.5', 0.00850114581424582)
('links title 12', 0.008422019796471138)
('title content 21.7', 0.008408045270684906)
('title 12', 0.008389198018399952)
('links links 12', 0.008310969499731388)
('title content 12.1', 0.00806078139870096)
('title content 21.8', 0.007796981648684179)
('links title 12.1', 0.00762567388763995)
('title content 21.4', 0.007124185637265965)
('title content 12.5', 0.007117845478346589)
('title content 12.6', 0.007076412588528119)


### 3. ExtraTrees

In [None]:
# Parameter tuning: walk every (n_estimators, max_depth) pair for ExtraTrees and remember
# the pair with the highest mean 5-fold cross-validated F1 (the first one found wins ties).
bScore, p1Val, p2Val = 0, -1, -1
p1Space = np.arange(100, 600, 50)
p2Space = np.arange(16, 25, 1)
for n_trees in p1Space:
    for depth in p2Space:
        candidate = ExtraTreesClassifier(n_estimators=n_trees, max_depth=depth, random_state=0)
        fold_f1 = cross_val_score(candidate, X, y, cv=5, scoring="f1")
        mean_f1 = fold_f1.mean()
        if mean_f1 > bScore:
            bScore = mean_f1
            p1Val, p2Val = n_trees, depth
print(f"N_estimators : {p1Val} Max_depth : {p2Val} Score : {bScore}")

N_estimators : 150 Max_depth : 22 Score : 0.651221488605764


In [None]:
# Build ExtraTrees with the tuned size/depth, train it on the full training set,
# and store its test-set predictions in the 'label' column of `id`.
model = ExtraTreesClassifier(n_estimators=p1Val, max_depth=p2Val, random_state=0)
model.fit(X, y)
id['label'] = model.predict(Xtest)
# Uncomment to write the predictions to disk.
#id.to_csv("./Results/Pred_ET.csv",index=False)

In [None]:
# Show the 20 features with the largest ExtraTrees feature_importances_ values,
# as (name, importance) pairs in descending order (ties fall back to the column name).
by_importance = sorted(zip(model.feature_importances_, train.columns), reverse=True)
z = [(column, importance) for importance, column in by_importance]
print("\n".join(map(str, z[:20])))

('title content 12.3', 0.014671437369268086)
('title content start 12.3', 0.01042361812058658)
('BOW 359', 0.00899295542747171)
('links links 12.1', 0.008772747955915573)
('title 12', 0.008375940617386501)
('links title 12.6', 0.00822563210217273)
('links links 21.1', 0.007519675721107407)
('links title 21.5', 0.007505993561191141)
('title content 21.3', 0.0074277433201988315)
('title content 12.9', 0.00607161461251757)
('content sim', 0.006059709668084181)
('content sim.1', 0.005951962780965822)
('title content start 21.3', 0.005762616889080573)
('title content 12.6', 0.0057290127693166845)
('links title 12.7', 0.005727057592856329)
('title 12.10', 0.005708057851403439)
('links title 21.6', 0.005394213581507397)
('title content start 12.1', 0.0051305349551598635)
('title content start 12.9', 0.005081110049577824)
('links title 12.5', 0.005008840900804355)


### 4. SVM

In [None]:
# Parameter tuning: try each C in the grid for a sigmoid-kernel SVC and keep the one
# with the highest mean 5-fold cross-validated F1.
bScore = 0
cVal = -1  # sentinel: remains -1 if no candidate scores strictly above 0
# Grid 0.1, 0.2, ..., 1.9 built from integers. np.arange with a float step accumulates
# rounding error, which is why the best C was previously reported as 1.3000000000000003.
pSpace = np.arange(1, 20) / 10
for c in pSpace:
  model = SVC(random_state=0,C=c,kernel='sigmoid')
  scores = cross_val_score(model, X, y, cv=5, scoring="f1")
  mScore = scores.mean()
  # Strict '>' keeps the first (smallest) C when several candidates tie.
  if mScore>bScore:
    bScore = mScore
    cVal = c
print(f"C : {cVal} Score : {bScore}")

C : 1.3000000000000003 Score : 0.642490605844281


In [None]:
# Refit the sigmoid-kernel SVC on the full training set with the C selected by the tuning
# cell, then predict labels for the test set.
# cVal is used instead of a hand-copied constant so the final model uses exactly the value
# the search selected. The logistic-regression section also assigns cVal, so run the cells in order.
model = SVC(random_state=0,C=cVal,kernel='sigmoid').fit(X,y)
id['label'] = model.predict(Xtest)
# Uncomment to write the predictions to disk.
#id.to_csv("./Results/Pred_SVC.csv",index=False)