In [1]:

import pandas as pd
import nltk
from nltk.corpus import stopwords
import unicodedata
from nltk.tokenize import word_tokenize
import string

# Ensure you have the necessary NLTK datasets downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data_path = 'word-differences.csv'  # Update this to the path of your uploaded file
df = pd.read_csv(data_path)

# Function to clean text: removes stopwords, punctuation, pronunciation marks, and converts to lowercase
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Remove punctuation
    tokens = [word for word in tokens if word.isalpha()]
    # Remove pronunciation marks and combine into a single string
    cleaned_text = ' '.join(tokens)
    cleaned_text = unicodedata.normalize('NFKD', cleaned_text).encode('ASCII', 'ignore').decode('utf-8')
    return cleaned_text

# Apply the cleaning function to the text column
df['cleaned_text'] = df['Differences'].apply(clean_text)

print(df[['Differences', 'cleaned_text']])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vigneshkrishnan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vigneshkrishnan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                            Differences  \
0     its they ionosphere, any nature's firmament ro...   
1     secured faces rebounds but her, lies offensive...   
2     differing ideologies stand favor identifying w...   
3     any within. to revered akin calmness bed, othe...   
4     crucible facing battles force interconnected w...   
...                                                 ...   
9202  surprise. then yonder might, sun feedin' plant...   
9203  if struck fear What notes say Johnny, to was t...   
9204  cryptic so close grand, wisdom This Though pro...   
9205  separated, extend. sight. hide. pass step stru...   
9206  crystal they how stand we freedom steers. No w...   

                                           cleaned_text  
0     ionosphere nature firmament roaring rains sigh...  
1     secured faces rebounds lies offensive upon cou...  
2     differing ideologies stand favor identifying p...  
3     within revered akin calmness bed grasping sens...  
4

In [6]:
# Testing out the pre processed data


import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Selecting 'Differences' as input and 'Labels_OG' as output
X = df['cleaned_text'].astype(str)  # Convert to string to ensure text processing
y = df['Labels_OG']

# Encoding the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Initializing the TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=10000)  # Limiting to 1000 features for simplicity

# Fitting TF-IDF to the data and transforming our text column into TF-IDF vectors
X_tfidf = tfidf.fit_transform(X).toarray()

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

# Initialize the CatBoostClassifier
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.01,
    depth=6,
    loss_function='MultiClass',
    verbose=True,  # Set to True to see CatBoost's training output
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
# Evaluate the model with various metrics
precision = precision_score(y_test, predictions, average='weighted')  # 'weighted' accounts for label imbalance
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
conf_matrix = confusion_matrix(y_test, predictions)

# Output the metrics
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)


0:	learn: 4.5927149	total: 4.08s	remaining: 6m 44s
1:	learn: 4.5914125	total: 8.23s	remaining: 6m 43s
2:	learn: 4.5899953	total: 12.2s	remaining: 6m 35s
3:	learn: 4.5885216	total: 16s	remaining: 6m 25s
4:	learn: 4.5863067	total: 20s	remaining: 6m 20s
5:	learn: 4.5858766	total: 24s	remaining: 6m 16s
6:	learn: 4.5845670	total: 28.3s	remaining: 6m 15s
7:	learn: 4.5830389	total: 32.2s	remaining: 6m 10s
8:	learn: 4.5808699	total: 36.1s	remaining: 6m 5s
9:	learn: 4.5799482	total: 40.1s	remaining: 6m
10:	learn: 4.5784808	total: 44.1s	remaining: 5m 57s
11:	learn: 4.5764541	total: 48.2s	remaining: 5m 53s
12:	learn: 4.5762635	total: 52.1s	remaining: 5m 48s
13:	learn: 4.5755371	total: 56.1s	remaining: 5m 44s
14:	learn: 4.5742595	total: 1m	remaining: 5m 41s
15:	learn: 4.5736545	total: 1m 4s	remaining: 5m 37s
16:	learn: 4.5724749	total: 1m 8s	remaining: 5m 33s
17:	learn: 4.5706111	total: 1m 12s	remaining: 5m 29s
18:	learn: 4.5694426	total: 1m 16s	remaining: 5m 25s
19:	learn: 4.5680646	total: 1m 20s

  _warn_prf(average, modifier, msg_start, len(result))
