In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from random import shuffle
import random
random.seed(123)

In [57]:
train_df = pd.read_csv("train_with_label.txt", delimiter = "r'\t", header = None, engine = 'python')
train_df = train_df[0].str.split("\t", expand=True)
train_df = train_df.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
train_df

Unnamed: 0,id,sentence1,sentence2,classification
0,train_id_0,The Democratic candidates also began announcin...,The Democratic candidates also began announcin...,1
1,train_id_1,The woman was exposed to the SARS virus while ...,The woman was exposed to the SARS virus while ...,1
2,train_id_2,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,1
3,train_id_3,A representative for Phoenix-based U-Haul decl...,"Anthony Citrano , a representative for WhenU ,...",0
4,train_id_4,The biggest threat to order seemed to be looti...,The biggest threat to order seemed to be looti...,1
...,...,...,...,...
4072,train_id_4072,"Axelrod died in his sleep of heart failure , s...",Axelrod died of heart failure while asleep at ...,1
4073,train_id_4073,"Saddam 's other son , Odai , surrendered Frida...","Hussein 's other son , Uday , surrendered yest...",1
4074,train_id_4074,If Senator Clinton does decide to run in 2008 ...,If Mrs Clinton does decide to contest the 2008...,1
4075,train_id_4075,"The Iranian refugee who sewed up his eyes , li...","An Iranian Kurd who stitched up his eyes , lip...",1


In [59]:
dev_df = pd.read_csv("dev_with_label.txt", delimiter = "r'\t", header = None, engine = 'python')
dev_df = dev_df[0].str.split("\t", expand=True)
dev_df = dev_df.rename(columns={0: "id", 1: "sentence1", 2: "sentence2", 3: "classification"})
dev_df

Unnamed: 0,id,sentence1,sentence2,classification
0,dev_id_0,Local police authorities are treating the expl...,Acting New Haven Police Chief Francisco Ortiz ...,0
1,dev_id_1,The report shows that drugs sold in Canadian p...,The report shows that drugs sold in Canadian p...,1
2,dev_id_2,The transition is slated to begin no later tha...,A two-week transition period will begin no lat...,1
3,dev_id_3,"Like Viacom , GE -- parent of NBC -- is also s...","Like Viacom , General Electric is seen as a le...",1
4,dev_id_4,"Last month , 62 Spanish peacekeepers died when...","In another disaster , 62 Spanish peacekeepers ...",1
...,...,...,...,...
719,dev_id_719,"He is a brother to three-year-old Mia , from K...","Winslet , 28 , has a three-year-old daughter M...",0
720,dev_id_720,Some 175 million shares traded on the Big Boar...,Some 1.6 billion shares traded on the Big Boar...,0
721,dev_id_721,Mr Berlusconi is accused of bribing judges to ...,Mr Berlusconi is accused of bribing judges to ...,1
722,dev_id_722,"He added that those "" are not solely American ...",""" These are not solely American principles nor...",1


In [62]:
x_train = train_df.drop('classification', axis = 1).values
y_train = train_df['classification'].values
x_train_val = dev_df.drop('classification', axis = 1).values
y_train_val = dev_df['classification'].values

print("train: {}, val: {}".format(x_train.shape[0], x_train_val.shape[0]))

train: 4077, val: 724


In [64]:
#We need to learn the model parameter  𝐰 . 
#However, with different hyperparameters  𝜆 , we can get different model parameter  𝐰 , resulting in different prediction performance. 
#Thus, we will use the 10-fold cross-validation to select the hyperparameter  𝜆 .

#Here we set the folds equal to 10 for 10-fold cross-validation
folds = 9

#We get the number of samples in the training and validation set
num_train = x_train.shape[0] 

#Now, we shuffle the index of samples in the train_val set
index_of_samples = np.arange(num_train) 
shuffle(index_of_samples)

#We split the index of the train_valid set into 10 folds
index_of_folds = index_of_samples.reshape(folds, -1)
print(index_of_folds)

#As suggested above, the hyperparameters chosen are listed below
regularization_coefficient = [10**(-5), 10**(-3), 10**(-2), 10**(-1), 1, 10, 20, 50]

#Variables we create to store the values of the best accuracy and best regression:
best_acc = 0.0
best_reg = 0.0

for reg in regularization_coefficient:
    #10-fold cross-validation
    sum_acc = 0.0
    for fold in range(folds):
        
        index_of_folds_temp = index_of_folds.copy()
        
        #We are getting the index of the validation set and storing it in a variable valid_index
        valid_index = index_of_folds_temp[fold,:].reshape(-1) 
        #We are getting the index of the training set and storing it in a variable train_index
        train_index = np.delete(index_of_folds_temp, fold, 0).reshape(-1)
        
        #Our training set:
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        
        #Our validation set:
        X_valid = x_train[valid_index]
        Y_valid = y_train[valid_index]
                
        #We write this to build the model with different hyperparameters:
        clf = LogisticRegression(penalty='l2', C=reg, solver='lbfgs')
        
        #Train the model with the training set:
        clf.fit(X_train, Y_train)
        
        y_valid_pred = clf.predict(X_valid)
        acc = accuracy_score(Y_valid, y_valid_pred)
        
        sum_acc += acc
    
    cur_acc = sum_acc / folds
    
    print("reg_coeff: {}, acc: {:.3f}".format(1.0/reg, cur_acc))
    
    #We now want to store the best hyperparameter:
    if cur_acc > best_acc:
        best_acc = cur_acc
        best_reg = reg
        
print("Best Accuracy: {:.4f} ".format(best_acc))
print("Best Reg: {:}".format(best_reg))

[[2189  397 3245 ...  502 1281 1071]
 [3323 2138 2407 ... 1399 2472 1970]
 [1009 2755 1669 ... 2788  739   56]
 ...
 [1905 1931 3405 ...  616  642  381]
 [3882  154 2514 ... 3834 3684 2297]
 [2880 2766 1795 ... 2495 3023 3716]]


ValueError: could not convert string to float: 'train_id_3323'

In [None]:
#Normalizing the features:
normalizer = StandardScaler()
X_train_val = normalizer.fit_transform(X_train_val)
X_test = normalizer.transform(X_test)

print(X_test.shape)
print(X_train_val.shape)