In [1]:
import numpy as np
import json
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Load the Balanced Dataset

In [2]:
# Read the balanced corpus from disk; 'text' is what gets vectorized and 'label' is the target.
df_comb = pd.read_csv('balanced_data.csv')
X, y = df_comb['text'], df_comb['label']


# Tf-Idf Preprocessing

In [3]:
# Tf-Idf helper, kept as a function so the fitting corpus, the text to transform,
# the vocabulary cap, and the n-gram setting can all be changed in one place.
def tfidf(cleanedText_train, cleanedText_test, maxFeatures = None, ngram = 1):
    '''
    Fit a Tf-Idf vectorizer on one collection of documents and transform another.

    Parameters
    ----------
    cleanedText_train : iterable of str
        Documents the vectorizer is fitted on (vocabulary and IDF weights).
    cleanedText_test : iterable of str
        Documents to convert; may be the same collection as ``cleanedText_train``.
    maxFeatures : int or None, default None
        Passed through as ``max_features``; None keeps the whole vocabulary.
    ngram : int or (int, int), default 1
        An int ``n`` selects n-grams of exactly that size, i.e. ``ngram_range=(n, n)``.
        A ``(min_n, max_n)`` pair is passed through as the range itself.

    Returns
    -------
    The result of ``TfidfVectorizer.transform`` on ``cleanedText_test``
    (a sparse Tf-Idf matrix).

    Notes
    -----
    A new vectorizer is fitted on every call, so two calls only yield matrices
    with matching columns when they receive the same fitting documents and the
    same settings.
    '''
    # Accept either a single n-gram size or an explicit (min_n, max_n) range.
    if isinstance(ngram, (tuple, list)):
        min_n, max_n = ngram
        ngram_range = (min_n, max_n)
    else:
        ngram_range = (ngram, ngram)

    vectorizer = TfidfVectorizer(max_features = maxFeatures, ngram_range = ngram_range)
    vectorizer.fit(cleanedText_train)
    return vectorizer.transform(cleanedText_test)

# Vectorize the whole corpus with the default settings (full vocabulary, unigrams).
# NOTE: the vectorizer is fitted on every row of X here, before the train/test split,
# so rows that later land in the held-out set also shape the vocabulary and IDF weights.
tfidf_vectors = tfidf(X, X)

# Train/Test Split

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression

## Train Model

In [5]:
# Logistic regression with class-balanced weights and the iteration cap raised to 1000.
model_lgr = LogisticRegression(max_iter=1000, class_weight='balanced')
model_lgr.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', max_iter=1000)

## Test Model

In [6]:
# Score the logistic-regression model on the held-out split.
predictions = model_lgr.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.8930438842203549


# SVM 

## Train Model

In [7]:
# Support-vector classifier: library defaults apart from class-balanced weights.
model_svm = SVC(class_weight='balanced')
model_svm.fit(X=X_train, y=y_train)

SVC(class_weight='balanced')

## Test Model

In [8]:
# Score the SVM on the held-out split.
svm_predictions = model_svm.predict(X_test)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_predictions)
print("SVM Accuracy:", svm_accuracy)

SVM Accuracy: 0.9661064425770308


# Neural Network: MLP

## Train Model

In [9]:
# Multi-layer perceptron with the default architecture.
# Its weight initialisation and batch sampling are random, so the seed is pinned
# (same value as the train/test split) to make a Restart & Run All reproduce the
# same fitted model and the same accuracy.
model_mlp = MLPClassifier(random_state=42)
model_mlp.fit(X_train, y_train)

MLPClassifier()

In [10]:
# Score the MLP on the held-out split.
mlp_predictions = model_mlp.predict(X_test)
mlp_accuracy = accuracy_score(y_true=y_test, y_pred=mlp_predictions)
print("MLP Accuracy:", mlp_accuracy)

MLP Accuracy: 0.9629318394024277


# Export Predictions to CSV for Kaggle

## Read the Test data

In [11]:
# The Kaggle test set is read as JSON Lines (one record per line).
file_path_test = 'data/test_set.json'
df_test = pd.read_json(file_path_test, lines=True)

# Each 'text' entry is walked item by item; the items are stringified and joined with
# single spaces so that every row becomes one string (presumably each entry is a
# token sequence -- confirm against the data file).
X_Kaggle = [" ".join(str(item) for item in row) for row in df_test['text'].to_list()]

## Tf-Idf Preprocessing

In [12]:
# Fit a fresh vectorizer on the corpus X (default settings) and apply it to the Kaggle text.
tfidf_vectors_Kaggle = tfidf(cleanedText_train = X, cleanedText_test = X_Kaggle)

## Predict using Trained LGR

In [13]:
# Label the Kaggle rows with the logistic-regression model and store them on the frame.
predictions_lgr = model_lgr.predict(tfidf_vectors_Kaggle)
df_test['class'] = predictions_lgr

# Only these columns are written to the submission file.
selected_columns = ['id', 'class']
df_test.to_csv('prediction/tfidf_lgr.csv', columns=selected_columns, index=False)

## Predict using Trained SVM

In [14]:
# Label the Kaggle rows with the SVM; this overwrites the 'class' column on the frame.
predictions_svm = model_svm.predict(tfidf_vectors_Kaggle)
df_test['class'] = predictions_svm

# Only these columns are written to the submission file.
selected_columns = ['id', 'class']
df_test.to_csv('prediction/tfidf_svm.csv', columns=selected_columns, index=False)


## Predict using Trained MLP

In [15]:
# Label the Kaggle rows with the MLP; this overwrites the 'class' column on the frame.
predictions_mlp = model_mlp.predict(tfidf_vectors_Kaggle)
df_test['class'] = predictions_mlp

# Only these columns are written to the submission file.
selected_columns = ['id', 'class']
df_test.to_csv('prediction/tfidf_mlp.csv', columns=selected_columns, index=False)