In [1]:
import functools
import os as os
import string

import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the en_core_web_sm spaCy pipeline and append the 'sentencizer' component.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("sentencizer")

# train.csv supplies the 'review' / 'response' columns used for fitting below;
# test.csv supplies the reviews that are scored at the end of the notebook.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

def data_cleaning(sentence):
    """Tokenize ``sentence`` with spaCy and return its normalized content tokens.

    Every token is replaced by its lowercased, whitespace-stripped lemma. A
    token is then discarded when it is empty, is one of spaCy's English
    ``STOP_WORDS``, or is made up solely of ASCII punctuation characters.

    Parameters
    ----------
    sentence : str
        Raw text, processed with the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        The surviving tokens, in document order.
    """
    punctuation_chars = set(string.punctuation)

    clean_tokens = []
    for token in nlp(sentence):
        # "-PRON-" is presumably the pronoun placeholder lemma of older spaCy
        # releases (TODO confirm); in that case keep the lowercased surface form.
        if token.lemma_ != "-PRON-":
            text = token.lemma_.lower().strip()
        else:
            text = token.lower_

        # Whitespace-only tokens collapse to "" after strip(); drop them explicitly.
        if not text:
            continue
        if text in STOP_WORDS:
            continue
        # Test character by character. `text in string.punctuation` is a
        # substring test against one long string, so runs such as "...", "!!"
        # or "--" were not recognised as punctuation and leaked into the
        # vocabulary.
        if all(ch in punctuation_chars for ch in text):
            continue
        clean_tokens.append(text)
    return clean_tokens

# TF-IDF features whose tokens come from the spaCy-based data_cleaning().
tfidf = TfidfVectorizer(tokenizer=data_cleaning)
classifier = LinearSVC()

X = train_df['review']
y = train_df['response']

# Hold out 20% of the labelled reviews for evaluation; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# fit_transform learns the vocabulary/idf and vectorizes in a single pass.
# A separate fit() followed by transform() on the same data yields the same
# matrix but runs the (slow) spaCy tokenizer over every training review twice.
tfidf_train = tfidf.fit_transform(x_train)
tfidf_test = tfidf.transform(x_test)

clf = classifier.fit(tfidf_train, y_train)
y_pred = clf.predict(tfidf_test)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           1       0.89      0.84      0.86      1518
           2       0.84      0.89      0.87      1465

    accuracy                           0.87      2983
   macro avg       0.87      0.87      0.87      2983
weighted avg       0.87      0.87      0.87      2983



In [None]:
# Every candidate/fold combination below refits the vectorizer on largely the
# same reviews, and the spaCy-backed tokenizer dominates the cost of each fit.
# Memoizing it per input string means each distinct review is parsed only once
# for the whole search; the tokens it returns are unchanged.
cached_data_cleaning = functools.lru_cache(maxsize=None)(data_cleaning)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=cached_data_cleaning)),
    ('clf', LinearSVC())
])

# Define the hyperparameters to tune
hyperparameters = {
    'tfidf__max_features': [1000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1, 10]
}

# Create the GridSearchCV object (5-fold cross-validation).
# n_jobs is left at its default so the fits run in this process and share the
# tokenizer cache; worker processes would each start with an empty one.
grid_search = GridSearchCV(pipeline, hyperparameters, cv=5)

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best parameters and best score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)



In [None]:
# Vectorize the test reviews with the already-fitted `tfidf`, then label them
# with the trained `clf`.
test_df_transformed = tfidf.transform(test_df["review"])
y_test_pred = clf.predict(test_df_transformed)

In [None]:
# Write the predictions as a single 'response' column, one value per row,
# without the index or a header line.
export_pred = pd.DataFrame({"response": y_test_pred})
export_pred.to_csv("test10.txt", header=False, index=False)