## Importing all necessary libararies

In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
from sklearn.metrics import cohen_kappa_score
import pandas as pd
from sklearn.preprocessing import StandardScaler
from Utility.ThesisUtility import printResults,readTextFromFileName,appendClustersToVector

### constants

In [11]:
# Set your path to all essays in the dataset for all prompts for all countriesl.
allEssaysPath = "../Dataset/Essays_all/ALLEssaysAllPrompts"

# the path to the excel file that contains the final resutls from the clustering experiment
finalResultsPath ="data/FinalResults.xlsx"

#Sbert model
model = SentenceTransformer('all-mpnet-base-v2', device="cuda")

### Building the right file names and reading the files

In [3]:
#reading the gold standards file provided from the MEWS
df = pd.read_csv("data/TRACE_Datensatz_transposed_220308.csv", sep='\t')
y = df['Code1_IN_AR_Gesamteindruck_Argumentation_3_4_0']  # y is the gold standards

listEssayNames = df['DocumentID']

listEssayNames = [i.removeprefix("00") for i in listEssayNames]

listEssayNames = [i.replace("al","aI") for i in listEssayNames]

#Building the complete filename from the provided csv sheet.
listRightNames = []
for essay in listEssayNames:
    x = essay.split("X")
    temp = x[0]+"_"+x[1]+".txt"
    listRightNames.append(temp)
    #print(x)

from pathlib import Path

essays =[] # the final content of each essay.
df_fileName = pd.DataFrame({})

#Reading the content of the essay given its name
for essayRigt in listRightNames:
    my_file = Path(allEssaysPath, essayRigt)
    if my_file.is_file():
        text = my_file.read_text(encoding='utf-8-sig')
        essays.append(text)

### CountVectorizer Implementation

In [4]:
essays_train, essays_test, y_train, y_test = train_test_split(essays, y, test_size=0.25, random_state=1000)

# buildig the vectorizer model and transforming the essays
vectorizer = CountVectorizer()
vectorizer.fit_transform(essays_train)
X_train = vectorizer.transform(essays_train)
X_test  = vectorizer.transform(essays_test)

#Trianing the classifier
classifier = LogisticRegression(max_iter=100)
classifier.fit(X_train, y_train)

#Evaluation
score1 = classifier.score(X_test, y_test)
pred = classifier.predict(X_test)

#print Evaluation results
printResults(score1,y_test,pred)

Accuracy: 0.3228621291448517
ِAdjecent Accuracy: 0.7678883071553229
pearson correaltion 1  (0.2193323781004512, 1.1360469950967985e-07)
weighted kappa:  0.21886867734795923
[[ 0  0  3  3  2  0  0]
 [ 0 12 24 23  6  0  0]
 [ 0 22 44 48 30  5  0]
 [ 2 21 71 93 43  4  1]
 [ 0  4 17 39 33  4  1]
 [ 0  1  3  5  4  3  0]
 [ 0  0  0  0  2  0  0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Sbert - baseline experiment

In [5]:
essays_train, essays_test, y_train, y_test = train_test_split(essays, y, test_size=0.25, random_state=1000)

# buildig the vectorizer model and transforming the essays
X_train = model.encode(essays_train, show_progress_bar =True, device="cuda")
X_test  = model.encode(essays_test, show_progress_bar =True, device="cuda")

#Trianing the classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

#Evaluation
score1 = classifier.score(X_test, y_test)
pred = classifier.predict(X_test)

#prinitng the results
printResults(score1,y_test,pred)

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Accuracy: 0.41012216404886565
ِAdjecent Accuracy: 0.8394415357766143
pearson correaltion 1  (0.2260998151449603, 4.4640303139982e-08)
weighted kappa:  0.1740535968522976
[[  0   0   4   3   1   0   0]
 [  0   0  19  46   0   0   0]
 [  0   0  34 103  12   0   0]
 [  0   1  31 183  20   0   0]
 [  0   0   9  71  18   0   0]
 [  0   0   0  14   2   0   0]
 [  0   0   0   2   0   0   0]]


### Tf-Idf vectorizer - baseline experiment

In [6]:
essays_train, essays_test, y_train, y_test = train_test_split(essays, y, test_size=0.25, random_state=1000)

# buildig the vectorizer model and transforming the essays
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
vectorizer.fit_transform(essays_train)
X_train = vectorizer.transform(essays_train)
X_test  = vectorizer.transform(essays_test)

#Trianing the classifier
classifier = SVC(kernel="linear")
classifier.fit(X_train, y_train)

#Evaluation
score1 = classifier.score(X_test, y_test)
pred = classifier.predict(X_test)

#prinitng the results
printResults(score1,y_test,pred)

Accuracy: 0.40663176265270506
ِAdjecent Accuracy: 0.8481675392670157
pearson correaltion 1  (0.2024922409462375, 1.022411070788281e-06)
weighted kappa:  0.14987146409625973
[[  0   0   4   4   0   0   0]
 [  0   0  20  43   2   0   0]
 [  0   0  40 103   6   0   0]
 [  0   0  39 187   9   0   0]
 [  0   0  11  81   6   0   0]
 [  0   0   0  15   1   0   0]
 [  0   0   0   1   1   0   0]]


### QWK between the first and the second rater

In [7]:
y1QWK = df['Code1_IN_AR_Gesamteindruck_Argumentation_3_4_0']
y2QWK = df['Code2_IN_AR_Gesamteindruck_Argumentation_3_4_0']

cohen_kappa_score(y1QWK,y2QWK,weights="quadratic")

0.3977300242869025

### TFIDF with features Clusters

In [12]:
essaysNames_train, essaysNames_test, y_train, y_test = train_test_split(listRightNames, y, test_size=0.25, random_state=1000)


## getting train and testing data seperately .....
essays_train=readTextFromFileName(essaysNames_train,allEssaysPath)
essays_test =readTextFromFileName(essaysNames_test,allEssaysPath)


vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
vectorizer.fit_transform(essays_train)
X_train = vectorizer.transform(essays_train)
X_test  = vectorizer.transform(essays_test)
X_train_WithClusers = appendClustersToVector(essaysNames_train,X_train,"tfidf",finalResultsPath)
X_test_WithClusters = appendClustersToVector(essaysNames_test,X_test,"tfidf",finalResultsPath)

### classification logic #######################
classifier = LogisticRegression()
classifier.fit(X_train_WithClusers, y_train)

#Evaluation
score1 = classifier.score(X_test_WithClusters, y_test)
pred = classifier.predict(X_test_WithClusters)

#prinitng the results
printResults(score1,y_test,pred)

Accuracy: 0.37521815008726006
ِAdjecent Accuracy: 0.8516579406631762
pearson correaltion 1  (0.26751836587620426, 7.579791244585139e-11)
weighted kappa:  0.2417616892911011
[[  1   0   4   3   0   0   0]
 [  1   3  26  32   3   0   0]
 [  1   5  49  84  10   0   0]
 [  0   3  50 151  31   0   0]
 [  0   1  12  75  10   0   0]
 [  0   0   0  14   1   1   0]
 [  0   0   0   2   0   0   0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Sbert with clusters as feature using normalized feature vector

In [14]:
essaysNames_train, essaysNames_test, y_train, y_test = train_test_split(listRightNames, y, test_size=0.25, random_state=1000)

## getting the training and testing essays seperately......
essays_train=readTextFromFileName(essaysNames_train,allEssaysPath)
essays_test =readTextFromFileName(essaysNames_test,allEssaysPath)

### getting the feature vectors and appending the clusters to them.
X_train = model.encode(essays_train, show_progress_bar =True, device="cuda")
X_test  = model.encode(essays_test, show_progress_bar =True, device="cuda")
X_train_WithClusers = appendClustersToVector(essaysNames_train,X_train,"sbert",finalResultsPath)
X_test_WithClusters = appendClustersToVector(essaysNames_test,X_test,"sbert",finalResultsPath)

### scaling the resulted feature vectors after appending the clusters to them..
scaler = StandardScaler()
scaler.fit(X_train_WithClusers)
X_TrainedScaled = scaler.fit_transform(X_train_WithClusers)
X_TestScaled = scaler.fit_transform(X_test_WithClusters)

### classification logic #######################
classifier = LogisticRegression() #LogisticRegression()  #DecisionTreeClassifier(max_depth=4)
classifier.fit(X_TrainedScaled, y_train)

#Evaluation
score1 = classifier.score(X_TestScaled, y_test)
pred = classifier.predict(X_TestScaled)

#prinitng the results
printResults(score1,y_test,pred)

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Accuracy: 0.3403141361256545
ِAdjecent Accuracy: 0.787085514834206
pearson correaltion 1  (0.2685690644036752, 6.350139877975657e-11)
weighted kappa:  0.2685057086165018
[[  1   2   4   1   0   0   0]
 [  2   6  29  19   7   1   1]
 [  2  24  51  50  16   6   0]
 [  0  25  55 104  50   1   0]
 [  1   3  21  35  32   6   0]
 [  0   0   2  12   1   1   0]
 [  0   0   0   0   0   2   0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
