## Importing all necessary libraries

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
from sklearn.metrics import cohen_kappa_score
import pandas as pd
from sklearn.preprocessing import StandardScaler
from pythonFiles.ThesisUtility import printResults,readTextFromFileName,appendClustersToVector

# Sentence-embedding model shared by all SBERT experiments in this notebook.
# It is created on the "cuda" device, so a CUDA-capable GPU is required.
model = SentenceTransformer(
    'all-mpnet-base-v2',
    device="cuda",
)

### constants

In [None]:
# Set your path to the folder containing all essays of the dataset
# (all prompts, all countries).
# The literal must be a raw string: in a normal string "\U" starts a
# \UXXXXXXXX unicode escape (a SyntaxError for this path) and "\6" would be
# read as an octal escape instead of a backslash followed by "6".
allEssaysPath = r"D:\UDE\6th Semester\MEMS\MEWS Data\MEWS_Essays\MEWS_Essays\Essays_all\ALLEssaysAllPrompts"

### Building the right file names and reading the files

In [52]:
# Read the gold-standard file provided by MEWS (tab-separated).
df = pd.read_csv(r"data\TRACE_Datensatz_transposed_220308.csv", sep='\t')
y = df['Code1_IN_AR_Gesamteindruck_Argumentation_3_4_0']  # y is the gold standard

# Normalise the document ids before turning them into file names:
# drop a leading "00" and replace every "al" with "aI".
listEssayNames = [i.removeprefix("00").replace("al", "aI") for i in df['DocumentID']]

# Build the complete file name "<part before X>_<part after X>.txt" from each id.
listRightNames = []
for essay in listEssayNames:
    parts = essay.split("X")
    listRightNames.append(parts[0] + "_" + parts[1] + ".txt")

from pathlib import Path

essays = []  # the content of each essay, one entry per row of df / y
df_fileName = pd.DataFrame({})  # not used in this cell; kept in case other cells rely on it

# Read the content of every essay given its file name. Missing files are
# collected instead of being skipped silently: a skipped file would leave
# `essays` shorter than `y` and shift every following essay onto the wrong
# label.
missingEssays = []
for essayName in listRightNames:
    essayFile = Path(allEssaysPath, essayName)
    if essayFile.is_file():
        essays.append(essayFile.read_text(encoding='utf-8-sig'))
    else:
        missingEssays.append(essayName)

if missingEssays:
    raise FileNotFoundError(
        f"{len(missingEssays)} essay file(s) not found in {allEssaysPath!r}; "
        f"first missing: {missingEssays[:5]}"
    )

### CountVectorizer Implementation

In [None]:
# Split essays and gold labels into train (75%) and test (25%) with a fixed seed.
essays_train, essays_test, y_train, y_test = train_test_split(essays, y, test_size=0.25, random_state=1000)

# Build the bag-of-words vectorizer. fit_transform learns the vocabulary on the
# training essays and returns their count matrix in the same pass, so no
# separate transform of the training data is needed. The test essays are only
# transformed with the vocabulary learned from the training essays.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(essays_train)
X_test = vectorizer.transform(essays_test)

# Train the classifier
classifier = LogisticRegression(max_iter=100)
classifier.fit(X_train, y_train)

# Evaluation: mean accuracy on the test set and the individual predictions
score1 = classifier.score(X_test, y_test)
pred = classifier.predict(X_test)

# Print the evaluation results (printResults comes from pythonFiles.ThesisUtility)
printResults(score1, y_test, pred)

### Sbert - baseline experiment

In [None]:
# Split essays and gold labels into train (75%) and test (25%) with a fixed seed.
essays_train, essays_test, y_train, y_test = train_test_split(
    essays, y, test_size=0.25, random_state=1000
)

# Encode both partitions with the sentence-transformer model (train first,
# then test), showing a progress bar and running on the GPU.
X_train, X_test = (
    model.encode(batch, show_progress_bar=True, device="cuda")
    for batch in (essays_train, essays_test)
)

# Train the classifier on the sentence embeddings
classifier = LogisticRegression().fit(X_train, y_train)

# Evaluation: individual predictions and mean accuracy on the test set
pred = classifier.predict(X_test)
score1 = classifier.score(X_test, y_test)

# Print the results (printResults comes from pythonFiles.ThesisUtility)
printResults(score1, y_test, pred)

### Tf-Idf vectorizer - baseline experiment

In [None]:
# Split essays and gold labels into train (75%) and test (25%) with a fixed seed.
essays_train, essays_test, y_train, y_test = train_test_split(essays, y, test_size=0.25, random_state=1000)

# Build the TF-IDF vectorizer (English stop words removed). fit_transform
# learns vocabulary and idf weights on the training essays and returns their
# matrix in the same pass, so no separate transform of the training data is
# needed. The test essays are only transformed with the fitted vectorizer.
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
X_train = vectorizer.fit_transform(essays_train)
X_test = vectorizer.transform(essays_test)

# Train the classifier
classifier = SVC(kernel="linear")
classifier.fit(X_train, y_train)

# Evaluation: mean accuracy on the test set and the individual predictions
score1 = classifier.score(X_test, y_test)
pred = classifier.predict(X_test)

# Print the results (printResults comes from pythonFiles.ThesisUtility)
printResults(score1, y_test, pred)

### QWK between the first and the second rater

In [None]:
# Columns holding the scores given by the first and the second rater.
firstRaterColumn = 'Code1_IN_AR_Gesamteindruck_Argumentation_3_4_0'
secondRaterColumn = 'Code2_IN_AR_Gesamteindruck_Argumentation_3_4_0'
y1QWK, y2QWK = df[firstRaterColumn], df[secondRaterColumn]

# Quadratic weighted kappa between the two raters; left as the last
# expression so the notebook displays the value.
cohen_kappa_score(y1QWK, y2QWK, weights="quadratic")

### TFIDF with features Clusters

In [None]:
# Split the essay file names (not the texts) so that the cluster features can
# later be looked up by file name; same split parameters as the baselines.
essaysNames_train, essaysNames_test, y_train, y_test = train_test_split(listRightNames, y, test_size=0.25, random_state=1000)

# Get the training and testing essay texts separately
# (readTextFromFileName comes from pythonFiles.ThesisUtility).
essays_train = readTextFromFileName(essaysNames_train, allEssaysPath)
essays_test = readTextFromFileName(essaysNames_test, allEssaysPath)

# fit_transform learns vocabulary and idf weights on the training essays and
# returns their matrix in the same pass, so no separate transform of the
# training data is needed. The test essays are only transformed.
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
X_train = vectorizer.fit_transform(essays_train)
X_test = vectorizer.transform(essays_test)

# Append the cluster features to the TF-IDF vectors.
# NOTE(review): appendClustersToVector is defined in pythonFiles.ThesisUtility;
# presumably it looks the clusters up by essay file name - confirm there.
X_train_WithClusers = appendClustersToVector(essaysNames_train, X_train, "tfidf")
X_test_WithClusters = appendClustersToVector(essaysNames_test, X_test, "tfidf")

# Classification
classifier = LogisticRegression()
classifier.fit(X_train_WithClusers, y_train)

# Evaluation: mean accuracy on the test set and the individual predictions
score1 = classifier.score(X_test_WithClusters, y_test)
pred = classifier.predict(X_test_WithClusters)

# Print the results (printResults comes from pythonFiles.ThesisUtility)
printResults(score1, y_test, pred)

### Sbert with clusters as feature using normalized feature vector

In [None]:
# Split the essay file names (not the texts) so that the cluster features can
# later be looked up by file name; same split parameters as the baselines.
essaysNames_train, essaysNames_test, y_train, y_test = train_test_split(listRightNames, y, test_size=0.25, random_state=1000)

# Get the training and testing essay texts separately
# (readTextFromFileName comes from pythonFiles.ThesisUtility).
essays_train = readTextFromFileName(essaysNames_train, allEssaysPath)
essays_test = readTextFromFileName(essaysNames_test, allEssaysPath)

# Encode the essays with the sentence-transformer model and append the
# cluster features to the resulting vectors.
# NOTE(review): appendClustersToVector is defined in pythonFiles.ThesisUtility;
# presumably it looks the clusters up by essay file name - confirm there.
X_train = model.encode(essays_train, show_progress_bar=True, device="cuda")
X_test = model.encode(essays_test, show_progress_bar=True, device="cuda")
X_train_WithClusers = appendClustersToVector(essaysNames_train, X_train, "sbert")
X_test_WithClusters = appendClustersToVector(essaysNames_test, X_test, "sbert")

# Scale the feature vectors after the clusters have been appended.
# The scaler is fitted on the training data only; the test data is transformed
# with the training mean/variance. Re-fitting on the test set would scale the
# two sets differently and leak test statistics into the evaluation.
scaler = StandardScaler()
X_TrainedScaled = scaler.fit_transform(X_train_WithClusers)
X_TestScaled = scaler.transform(X_test_WithClusters)

# Classification
classifier = LogisticRegression()
classifier.fit(X_TrainedScaled, y_train)

# Evaluation: mean accuracy on the test set and the individual predictions
score1 = classifier.score(X_TestScaled, y_test)
pred = classifier.predict(X_TestScaled)

# Print the results (printResults comes from pythonFiles.ThesisUtility)
printResults(score1, y_test, pred)