In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; every dataset path used below
# lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the three project CSV files from the mounted Drive folder in one pass:
# the labelled rows, the test rows and the sample submission file.
train, test, sample = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/NLP_project_dataset/file2.csv',
        '/content/drive/MyDrive/NLP_project_dataset/test.csv',
        '/content/drive/MyDrive/NLP_project_dataset/sample_submission.csv',
    ),
)

In [4]:
# Feature table used by every model below.
# NOTE(review): the file name is spelled 'enbedding.csv' — presumably this
# matches the file as stored on Drive; verify before "correcting" it.
embeds = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/NLP_project_dataset/enbedding.csv'
)

In [5]:
# Attach each row's discourse type to the feature table as a 'Type' column.
# embeds and train come from separate CSV files and are paired only through
# their row index, so fail fast when they do not have the same number of rows;
# otherwise pandas would silently align on the index and leave gaps or drop
# rows, and the mismatch would only surface later as an unrelated error.
if len(embeds) != len(train):
    raise ValueError(
        f"embeds has {len(embeds)} rows but train has {len(train)}; "
        "cannot pair features with discourse types row by row"
    )
embeds['Type'] = train['discourse_type']
# Last expression of the cell: displays the distinct discourse-type labels.
embeds['Type'].unique()

array(['Lead', 'Position', 'Claim', 'Evidence', 'Counterclaim',
       'Rebuttal', 'Concluding Statement'], dtype=object)

In [6]:
# Integer code for every discourse type: labels are numbered 1..7 in the order
# listed here.
dict_discourse = {
    label: code
    for code, label in enumerate(
        (
            'Lead',
            'Position',
            'Claim',
            'Evidence',
            'Counterclaim',
            'Rebuttal',
            'Concluding Statement',
        ),
        start=1,
    )
}

In [7]:
# Encode the discourse type as its integer code from dict_discourse.
#
# Series.map is used instead of Series.replace: replace() on a string column
# only yields an integer column through an implicit object->int downcast that
# newer pandas releases deprecate, and it silently lets unknown labels through.
# The codes are derived from train['discourse_type'] rather than from the
# current embeds['Type'], so re-running this cell always gives the same result.
#
# NOTE(review): codes 1..7 impose an order on what looks like a nominal
# category; consider one-hot encoding for distance/kernel based models.
type_codes = train['discourse_type'].map(dict_discourse)
if type_codes.isna().any():
    unmapped = train.loc[type_codes.isna(), 'discourse_type'].unique().tolist()
    raise ValueError(
        f"discourse_type values missing from dict_discourse: {unmapped}"
    )
embeds['Type'] = type_codes.astype('int64')

In [8]:
# Disabled alternative: binary target (1 = Effective or Ineffective, 0 = Adequate).
# embeds['Target'] = train['discourse_effectiveness'].replace({"Ineffective": 1, "Effective": 1, "Adequate": 0})

In [9]:
# Three-class target: Ineffective -> -1, Adequate -> 0, Effective -> 1.
#
# Series.map with an explicit integer dtype replaces the original
# Series.replace call: replace() relied on pandas implicitly downcasting the
# resulting object column to integers, which newer pandas releases deprecate,
# and a label column left as object dtype is not a valid classifier target.
# Unknown or missing labels now raise immediately instead of passing through.
effectiveness_codes = {"Ineffective": -1, "Effective": 1, "Adequate": 0}
target = train['discourse_effectiveness'].map(effectiveness_codes)
if target.isna().any():
    unmapped = train.loc[target.isna(), 'discourse_effectiveness'].unique().tolist()
    raise ValueError(
        f"discourse_effectiveness values without a target code: {unmapped}"
    )
target = target.astype('int64')

In [10]:
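# Disabled: split the 'Target' column back out of embeds (only needed with the binary-target cell above).
# target = embeds['Target']
# embeds = embeds.drop(columns=['Target'])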

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

In [12]:
from sklearn.linear_model import LogisticRegression as lr

In [13]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified on the label — consider passing
# stratify=target if both parts should keep the same class proportions.
X_train, X_valid, y_train, y_valid = train_test_split(
    embeds,
    target,
    random_state=42,
    test_size=0.2,
)

**Support Vector Classifier**

In [14]:
# Support-vector classifier created with the library defaults; the kernel and
# C actually used are supplied through the grid search in the next cell.
svm = SVC()

In [15]:
# The grid holds a single candidate (RBF kernel, C=0.1), so the search acts as
# a cross-validated fit of that one configuration rather than a real search.
parameters = dict(kernel=['rbf'], C=[0.1])
clf = GridSearchCV(estimator=svm, param_grid=parameters)
clf.fit(X_train, y_train)

In [16]:
# Report the selected SVC configuration and its cross-validation score.
print("Best parameters: ", clf.best_params_, sep=" ")
print("Best score: ", clf.best_score_, sep=" ")

Best parameters:  {'C': 0.1, 'kernel': 'rbf'}
Best score:  0.6447031404580179


In [17]:
# Predict labels for the held-out validation rows with the fitted SVC search.
# y_pred is reassigned by each later run, so evaluate it before moving on.
y_pred = clf.predict(X_valid)

In [18]:
# Plain-list copies of the SVC predictions and of the true validation labels,
# consumed by the metric cells that follow.
y_pred_l, y_valid_l = list(y_pred), list(y_valid)

In [19]:
from sklearn.metrics import classification_report as cr, confusion_matrix as cm
# Per-class precision / recall / F1 of the SVC on the validation split, with
# the classes reported in the order -1, 0, 1.
print(
    cr(
        y_true=y_valid_l,
        y_pred=y_pred_l,
        labels=[-1, 0, 1],
    )
)

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      1282
           0       0.64      0.94      0.76      4246
           1       0.73      0.44      0.55      1825

    accuracy                           0.65      7353
   macro avg       0.46      0.46      0.44      7353
weighted avg       0.55      0.65      0.57      7353



In [20]:
# Confusion matrix of the SVC on the validation split, classes ordered -1, 0, 1.
print(
    cm(
        y_true=y_valid_l,
        y_pred=y_pred_l,
        labels=[-1, 0, 1],
    )
)

[[   0 1261   21]
 [   0 3972  274]
 [   1 1020  804]]


**Logistic Regression**

In [21]:
# Logistic-regression estimator (lr is the alias given to sklearn's
# LogisticRegression in the import cell). It is the base estimator for both
# logistic-regression grid searches below (clf_2 and clf_4).
model = lr()

In [22]:
# Single-candidate grid: L2-penalised logistic regression, C=0.1, liblinear.
# NOTE(review): liblinear handles the three classes one-vs-rest; newer
# scikit-learn releases may warn about that for multiclass targets — confirm
# against the installed version.
params = dict(penalty=['l2'], C=[0.1], solver=['liblinear'])
clf_2 = GridSearchCV(estimator=model, param_grid=params)
clf_2.fit(X_train, y_train)

In [23]:
# Report the selected liblinear logistic-regression configuration and its
# cross-validation score.
print("Best parameters: ", clf_2.best_params_, sep=" ")
print("Best score: ", clf_2.best_score_, sep=" ")

Best parameters:  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best score:  0.6674148329232917


In [24]:
# Predict validation labels with the fitted liblinear logistic-regression
# search; this overwrites the SVC predictions held in y_pred.
y_pred = clf_2.predict(X_valid)

In [25]:
# Refresh the list copies so the metric cells see the liblinear predictions.
y_pred_l = [*y_pred]
y_valid_l = [*y_valid]

In [26]:
from sklearn.metrics import classification_report as cr, confusion_matrix as cm
# Per-class precision / recall / F1 of the liblinear logistic regression on the
# validation split, with the classes reported in the order -1, 0, 1.
print(
    cr(
        y_true=y_valid_l,
        y_pred=y_pred_l,
        labels=[-1, 0, 1],
    )
)

              precision    recall  f1-score   support

          -1       0.54      0.15      0.23      1282
           0       0.67      0.86      0.75      4246
           1       0.69      0.58      0.63      1825

    accuracy                           0.67      7353
   macro avg       0.63      0.53      0.54      7353
weighted avg       0.65      0.67      0.63      7353



In [27]:
# Confusion matrix of the liblinear logistic regression on the validation
# split, classes ordered -1, 0, 1.
print(
    cm(
        y_true=y_valid_l,
        y_pred=y_pred_l,
        labels=[-1, 0, 1],
    )
)

[[ 190 1059   33]
 [ 143 3655  448]
 [  16  749 1060]]


**Run-III: Logistic Regression (saga solver, max_iter=500)**

In [28]:
# Same logistic-regression estimator as the previous run, now with the saga
# solver and an iteration cap of 500; again a single-candidate grid.
params = dict(penalty=['l2'], C=[0.1], solver=['saga'], max_iter=[500])
clf_4 = GridSearchCV(estimator=model, param_grid=params)
clf_4.fit(X_train, y_train)

In [29]:
# Report the selected saga logistic-regression configuration and its
# cross-validation score.
print("Best parameters: ", clf_4.best_params_, sep=" ")
print("Best score: ", clf_4.best_score_, sep=" ")

Best parameters:  {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
Best score:  0.6654427550541695


In [30]:
# Predict validation labels with the fitted saga logistic-regression search;
# this overwrites the predictions of the previous run held in y_pred.
y_pred = clf_4.predict(X_valid)

In [31]:
# Refresh the list copies so the metric cells see the saga predictions.
y_pred_l = [label for label in y_pred]
y_valid_l = [label for label in y_valid]

In [32]:
# Per-class precision / recall / F1 of the saga logistic regression on the
# validation split, with the classes reported in the order -1, 0, 1.
print(
    cr(
        y_true=y_valid_l,
        y_pred=y_pred_l,
        labels=[-1, 0, 1],
    )
)

              precision    recall  f1-score   support

          -1       0.53      0.20      0.29      1282
           0       0.67      0.84      0.75      4246
           1       0.68      0.59      0.63      1825

    accuracy                           0.67      7353
   macro avg       0.63      0.54      0.56      7353
weighted avg       0.65      0.67      0.64      7353



In [33]:
# Confusion matrix of the saga logistic regression on the validation split,
# classes ordered -1, 0, 1.
print(
    cm(
        y_true=y_valid_l,
        y_pred=y_pred_l,
        labels=[-1, 0, 1],
    )
)

[[ 256  992   34]
 [ 208 3575  463]
 [  17  736 1072]]


**Run-IV: K-Nearest Neighbors**

In [34]:
from sklearn.neighbors import KNeighborsClassifier as knn

In [35]:
# Rebinds `model` (until now the logistic-regression estimator) to a
# k-nearest-neighbours classifier (knn aliases sklearn's KNeighborsClassifier).
model = knn()

In [36]:
# Single-candidate grid for KNN: 18 neighbours with distance-weighted votes.
params = dict(n_neighbors=[18], weights=['distance'])
clf_5 = GridSearchCV(estimator=model, param_grid=params)
clf_5.fit(X_train, y_train)

In [37]:
# Report the selected KNN configuration and its cross-validation score.
print("Best parameters: ", clf_5.best_params_, sep=" ")
print("Best score: ", clf_5.best_score_, sep=" ")

Best parameters:  {'n_neighbors': 18, 'weights': 'distance'}
Best score:  0.6523530157347432


In [38]:
# Predict validation labels with the fitted KNN search; this overwrites the
# predictions of the previous run held in y_pred.
y_pred = clf_5.predict(X_valid)

In [39]:
# Refresh the list copies so the metric cells see the KNN predictions.
y_valid_l = list(y_valid)
y_pred_l = list(y_pred)

In [40]:
# Per-class precision / recall / F1 of the KNN classifier on the validation
# split, with the classes reported in the order -1, 0, 1.
print(
    cr(
        y_true=y_valid_l,
        y_pred=y_pred_l,
        labels=[-1, 0, 1],
    )
)

              precision    recall  f1-score   support

          -1       0.51      0.14      0.22      1282
           0       0.66      0.87      0.75      4246
           1       0.68      0.52      0.59      1825

    accuracy                           0.65      7353
   macro avg       0.62      0.51      0.52      7353
weighted avg       0.64      0.65      0.62      7353



In [41]:
# Confusion matrix of the KNN classifier on the validation split, classes
# ordered -1, 0, 1.
print(
    cm(
        y_true=y_valid_l,
        y_pred=y_pred_l,
        labels=[-1, 0, 1],
    )
)

[[ 177 1072   33]
 [ 155 3688  403]
 [  15  866  944]]
