In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yashc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yashc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data = pd.read_csv('./IMSyPP_EN_YouTube_comments_train.csv')
data = data[['Text', 'Type']]
data.head()

Unnamed: 0,Text,Type
0,Corona virus Chaple Hill NC lab 2015........ G...,0. appropriate
1,Really https://m.facebook.com/story.php?story_...,0. appropriate
2,just listening to his ugly voice makes me sick,2. offensive
3,-*-*-*- you can vote than for him....https://w...,0. appropriate
4,-*-*-*- @Cindy’s Journey i hope you see also t...,2. offensive


In [5]:
data["Type"].value_counts()
data = data[data['Type'] != '0']

In [7]:
data["Type"].value_counts()

0. appropriate      52989
2. offensive        45863
3. violent           2589
1. inappropriate     1739
Name: Type, dtype: int64

In [6]:
data_test= pd.read_csv('./IMSyPP_EN_YouTube_comments_evaluation_no_context.csv')
data_test = data_test[['Text', 'Type']]
data_test.dropna(inplace=True)
data_test.head()

Unnamed: 0,Text,Type
0,There wont be another stimulus.,0. appropriate
1,Thanks for sharing such a wonderful and useful...,0. appropriate
2,"Dave, check out Schrodinger (SDGR) in the phar...",0. appropriate
3,Why would you play striker cards? They provide...,0. appropriate
4,"Block Dragon is so broken honestly, why isn't ...",0. appropriate


In [9]:
data_test["Type"].value_counts()

0. appropriate      15934
2. offensive         5310
1. inappropriate      183
3. violent             76
Name: Type, dtype: int64

In [9]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21503 entries, 0 to 21517
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    21503 non-null  object
 1   Type    21503 non-null  object
dtypes: object(2)
memory usage: 504.0+ KB


In [7]:
# Preprocess the text data
data['Text'] = data['Text'].str.lower()
data=data.dropna()

# Split your dataset into training and testing sets
X_train_initial = data['Text']
y_train_initial  = data['Type']  # Assuming 'Type' contains labels (0: appropriate, 2: offensive)

X_test = data_test['Text']
y_test = data_test['Type']

In [8]:
# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Custom tokenizer function for stop words removal and lemmatization
def custom_tokenizer(text):
    words = nltk.word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return words

In [9]:
# Initialize the TF-IDF vectorizer to convert text data to numerical format
# Vectorize the text data using TF-IDF with stop words removal and lemmatization
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
  # You can adjust the max_features as needed

X_train_tfidf = vectorizer.fit_transform(X_train_initial)

In [14]:
random_oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Fit and apply RandomOverSampler to the training data
X_train, y_train = random_oversampler.fit_resample(X_train_tfidf, y_train_initial)

In [15]:
y_train.value_counts()

0. appropriate      52989
2. offensive        52989
3. violent          52989
1. inappropriate    52989
Name: Type, dtype: int64

In [16]:

X_test_vectorized = vectorizer.transform(X_test)

In [17]:
# Train a Random Forest classifier
classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
classifier.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [29]:
# Classify new text
new_text = ["terrorist blasted a bomb"]
new_text_vectorized = vectorizer.transform(new_text)
predicted_label = classifier.predict(new_text_vectorized)
print("Predicted label:", predicted_label[0])

Predicted label: 2. offensive


In [27]:
# Evaluate model accuracy on the test set
y_pred = classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy:", accuracy)

Test set accuracy: 0.7852857740780356


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [31]:
# Calculate confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(confusion_mat)

Confusion Matrix:
[[13987     4  1925    18]
 [   97     9    77     0]
 [ 2401    13  2887     9]
 [   51     0    22     3]]


In [33]:
# Calculate precision, recall, and F1-score for each class
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

Precision: 0.7752546757710246
Recall: 0.7852857740780356
F1-score: 0.7788420972871782


In [17]:
import pickle

In [14]:
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(classifier, file)

NameError: name 'classifier' is not defined

In [19]:
with open('random_forest_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [39]:
new_text = ["kill them pakistanis"]
new_text_vectorized = vectorizer.transform(new_text)
predicted_label = loaded_model.predict(new_text_vectorized)
print("Predicted label:", predicted_label[0])

Predicted label: 3. violent


In [11]:
custom_object = {
    'vectorizer': vectorizer,
    'tokenizer': custom_tokenizer,
    'nltk_stopwords': stopwords,
    'nltk_wordnet_lemmatizer': lemmatizer,
}

In [12]:
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(custom_object, file)

In [13]:
vectorizernew = TfidfVectorizer(stop_words='english')
  # You can adjust the max_features as needed

X_train_tfidf_new = vectorizernew.fit_transform(X_train_initial)

In [20]:
new_text = ["kill them pakistanis"]
new_text_vectorized = vectorizernew.transform(new_text)
predicted_label = loaded_model.predict(new_text_vectorized)
print("Predicted label:", predicted_label[0])

ValueError: X has 44350 features, but RandomForestClassifier is expecting 52890 features as input.