# 1. Imports 

In [1]:
import os
import re

import joblib
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob

# 2. Text preprocessing

## 2.1. Loading the data

In [2]:
# Read the train and test CSV files (paths are relative to the notebook's working directory)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'preprocessed_data/train_cleaned.csv',
        'preprocessed_data/test_cleaned.csv',
    )
)

## 2.2. Text cleaning function

In [3]:
# Single lemmatizer instance shared by every preprocess_text call.
# NOTE(review): word_tokenize and WordNetLemmatizer rely on NLTK data packages;
# no nltk.download call is visible in this notebook -- confirm they are installed.
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one text field into a space-separated string of lemmatized tokens.

    Processing steps, in order:
      1. every character that is not an ASCII letter is replaced by a space,
      2. the text is lowercased,
      3. the text is tokenized with ``nltk.word_tokenize``,
      4. each token is lemmatized with the module-level ``lemmatizer``.

    No spelling correction is applied.

    Parameters
    ----------
    text : str, None or float NaN
        Raw text. ``None`` and float NaN are treated as empty text; any other
        non-string value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces ('' for missing input).
    """
    # pd.read_csv parses empty cells as float NaN by default; re.sub would
    # raise TypeError on such a value, so missing input maps to empty text.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Keep only alphabetic characters, then lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower()

    # Tokenization and lemmatization using NLTK
    tokens = nltk.word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

## 2.3. Applying the text cleaning function

In [4]:
def apply_preprocessing(df, columns=('text', 'subject', 'title')):
    """Clean the given text columns and merge them into one ``combined_text`` column.

    Each column listed in ``columns`` is passed value by value through
    ``preprocess_text``; the cleaned columns are then concatenated row-wise,
    in the order given, separated by a single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``. It is not
        modified; the work is done on a copy.
    columns : sequence of str, optional
        Names of the text columns to clean and combine. The default
        reproduces the original behavior: text, subject, title.

    Returns
    -------
    pandas.DataFrame
        A frame with the single column ``combined_text`` and the same index
        as ``df``.

    Raises
    ------
    ValueError
        If ``columns`` is empty.
    KeyError
        If a requested column is missing from ``df``.
    """
    columns = list(columns)
    if not columns:
        raise ValueError('columns must contain at least one column name')

    df = df.copy()  # keep the caller's frame untouched
    cleaned_names = []
    for col in columns:
        cleaned_name = 'cleaned_' + col
        df[cleaned_name] = df[col].apply(preprocess_text)
        cleaned_names.append(cleaned_name)

    # Row-wise concatenation, preserving the order of `columns`
    combined = df[cleaned_names[0]]
    for cleaned_name in cleaned_names[1:]:
        combined = combined + ' ' + df[cleaned_name]
    df['combined_text'] = combined
    return df[['combined_text']]

In [5]:
# Run the cleaning pipeline on both splits; each result holds only the 'combined_text' column
train_df_processed, test_df_processed = (
    apply_preprocessing(frame) for frame in (train_df, test_df)
)

## 2.4. Using TF-IDF to vectorize the text

In [6]:
# Target labels are taken from the raw training frame
Y = train_df['class']

# TF-IDF features, capped at 1000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on the whole training frame before the
# train/validation split performed in a later cell, so the validation rows also
# contribute to the vocabulary and IDF weights.
tfidf = TfidfVectorizer(max_features=1000)
train_corpus = train_df_processed['combined_text']
test_corpus = test_df_processed['combined_text']

# Fit on the training corpus only; the test corpus is transformed with the fitted vectorizer
X = tfidf.fit_transform(train_corpus).toarray()
X_test = tfidf.transform(test_corpus).toarray()

# 3. Modeling 

## 3.1. Splitting the data

In [7]:
# Hold out 20% of the vectorized training rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

## 3.2. Model 1 - Decision Tree

### 3.2.1. Grid search and building the model

In [8]:
# Hyperparameter search space for the Decision Tree (2 * 5 * 3 * 3 = 90 candidates)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search with 5-fold cross-validation.
# The tree is seeded (same seed as the train/validation split): without
# random_state a DecisionTreeClassifier is not deterministic, so the selected
# hyperparameters and the reported accuracy could change between runs.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Get the best model (refitted by GridSearchCV on X_train with the best parameters)
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print(f'Best hyperparameters: {grid_search.best_params_}')

# Evaluate the best model on the held-out validation data
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print('Accuracy on validation data:', val_accuracy)

# Retrain the best model on the combined training and validation data.
# This refits the same object that grid_search.best_estimator_ refers to.
best_model.fit(X, Y)

# Make predictions on the test data
y_test_pred = best_model.predict(X_test)

Best hyperparameters: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy on validation data: 0.998144253371273


### 3.2.2. Saving the model and results

In [9]:
# Make sure the output folder exists: neither DataFrame.to_csv nor joblib.dump
# creates missing directories, so a fresh checkout would otherwise fail here.
os.makedirs('Results', exist_ok=True)

# Save the test data predictions to CSV; IDs are 1-based positions in prediction order.
# best_model and y_test_pred are rebound by the Naive Bayes cell further down,
# so run this cell right after the Decision Tree cell.
output_df = pd.DataFrame({
    'ID': range(1, len(y_test_pred) + 1),
    'class': y_test_pred
})
output_df.to_csv('Results/predicted_test_data_DT.csv', index=False)

# Save the best model to a file (as the last expression, joblib.dump's return value is displayed)
joblib.dump(best_model, 'Results/best_model_DT.pkl')

['Results/best_model_DT.pkl']

## 3.3. Model 2 - Naive Bayes

### 3.3.1. Grid search and building the model

In [10]:
# Candidate values for the MultinomialNB `alpha` parameter
param_grid = dict(alpha=[0.1, 0.5, 1.0, 5.0, 10.0])

# 3-fold cross-validated grid search, scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
).fit(X_train, y_train)

# Best estimator found by the search, plus the parameters that produced it
best_model = grid_search.best_estimator_
print(f'Best hyperparameters: {grid_search.best_params_}')

# Score the selected model on the held-out validation split
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print('Accuracy on validation data:', val_accuracy)

# Refit on training + validation data together, then predict the test set
y_test_pred = best_model.fit(X, Y).predict(X_test)

Best hyperparameters: {'alpha': 0.1}
Accuracy on validation data: 0.9444513175801064


### 3.3.2. Saving the model and results

In [11]:
# Make sure the output folder exists: neither DataFrame.to_csv nor joblib.dump
# creates missing directories, so a fresh checkout would otherwise fail here.
os.makedirs('Results', exist_ok=True)

# Save the test data predictions to CSV; IDs are 1-based positions in prediction order
output_df = pd.DataFrame({
    'ID': range(1, len(y_test_pred) + 1),
    'class': y_test_pred
})
output_df.to_csv('Results/predicted_test_data_NB.csv', index=False)

# Save the best model to a file (as the last expression, joblib.dump's return value is displayed)
joblib.dump(best_model, 'Results/best_model_NB.pkl')

['Results/best_model_NB.pkl']