In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [19]:
# Load Word2Vec model
word2vec_model = Word2Vec.load("word2vec.model")



In [20]:
# Load training and testing datasets
train_data = pd.read_csv('project_train.csv')  # Update with your training data
test_data = pd.read_csv('project_test.csv')    # Update with your testing data



In [21]:
# Preprocessing for Word2Vec
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

X_train_seq = tokenizer.texts_to_sequences(train_data['text'])
X_train_seq = pad_sequences(X_train_seq, maxlen=100)  # Adjust maxlen as needed
y_train = train_data['category']

X_test_seq = tokenizer.texts_to_sequences(test_data['text'])
X_test_seq = pad_sequences(X_test_seq, maxlen=100)    # Adjust maxlen as needed
y_test = test_data['category']



In [22]:
# Vectorize sentences using Word2Vec embeddings
def vectorize_sentence(sentence):
    vectors = []
    for word in sentence.split():
        if word in word2vec_model.wv:
            vectors.append(word2vec_model.wv[word])
    if vectors:
        return sum(vectors) / len(vectors)
    else:
        return [0] * word2vec_model.vector_size

X_train_word2vec = [vectorize_sentence(sentence) for sentence in train_data['text']]
X_test_word2vec = [vectorize_sentence(sentence) for sentence in test_data['text']]



In [23]:
# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()



In [24]:
# Fit and transform on training data
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['text'])



In [25]:
# Transform testing data
X_test_tfidf = tfidf_vectorizer.transform(test_data['text'])



In [9]:
# Print TF-IDF values
feature_names = tfidf_vectorizer.get_feature_names_out()
for doc_index, doc in enumerate(train_data['text']):
    print(f"TF-IDF values for document {doc_index}:")
    feature_index = X_train_tfidf[doc_index, :].nonzero()[1]
    tfidf_scores = zip(feature_index, [X_train_tfidf[doc_index, x] for x in feature_index])
    for word_index, score in tfidf_scores:
        print(f"    {feature_names[word_index]}: {score}")
    print()



TF-IDF values for document 0:
    ജവയത: 0.475833735775111
    വന: 0.2355884688348658
    അല: 0.4986292108455628
    ഒര: 0.35230493401248075
    പല: 0.3448372495984739
    പലദ: 0.475833735775111

TF-IDF values for document 1:
    അട: 0.22995113820259966
    ളവർ: 0.3277147529969599
    എന: 0.18637028897191818
    ളത: 0.30248575602018407
    ഉള: 0.5904617746920442
    ർക: 0.3321585940342163
    ടന: 0.256399895708316
    ഏട: 0.28008805834523487
    ഓണ: 0.3321585940342163

TF-IDF values for document 2:
    ആട: 0.21809000630532271
    mech: 0.1973075185469681
    royal: 0.2106220356268274
    ആണ: 0.2596909527115941
    തല: 0.10202864940599596
    ആരണ: 0.8895479830583208

TF-IDF values for document 3:
    manual: 0.42019380514138277
    midhun: 0.35989663330587995
    khalid: 0.42019380514138277
    shaiju: 0.45181448226589344
    syam: 0.4333175719006434
    sushin: 0.3541868166287921

TF-IDF values for document 4:
    ej: 1.0

TF-IDF values for document 5:
    kaanum: 0.3858646000366381
   

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



    upakarikkum: 0.1802988400761012
    abhinethakkalk: 0.1802988400761012
    labhchillenkil: 0.1802988400761012
    pratheeshicha: 0.1802988400761012
    nannayirikku: 0.1802988400761012
    irangiyale: 0.1802988400761012
    finished: 0.1802988400761012
    aakkuo: 0.1802988400761012
    aakkiyath: 0.1802988400761012
    okay: 0.16714181211860724
    pratheeshikkunnu: 0.17060256314994526
    problem: 0.17060256314994526
    avasarangal: 0.17483818009468108
    aayalum: 0.1539847841611132
    poomaram: 0.15744553519245125
    kittan: 0.15563639748621944
    aayitt: 0.15744553519245125
    vijayam: 0.1539847841611132
    kooduthal: 0.12440873902031598
    magic: 0.1497491672163774
    action: 0.1302053649224869
    ithiri: 0.15944544414253334
    avasanam: 0.14526213495388382
    polum: 0.1276707282461252
    aaya: 0.13723564423980106
    aanu: 0.10155543413666603
    nalla: 0.09792890406915289
    okke: 0.11471246209416003
    illa: 0.09894601766961238
    shine: 0.13536709622219906


In [26]:
# Combine Word2Vec and TF-IDF features
X_train_combined = np.concatenate((X_train_word2vec, X_train_tfidf.toarray()), axis=1)
X_test_combined = np.concatenate((X_test_word2vec, X_test_tfidf.toarray()), axis=1)



In [27]:
# Train Random Forest classifier
rf_classifier = RandomForestClassifier()


In [28]:
rf_classifier.fit(X_train_combined, y_train)



In [29]:
# Evaluate the classifier
predictions = rf_classifier.predict(X_test_combined)



In [30]:
# Calculate precision, recall, and F1-score
report = classification_report(y_test, predictions)
print("Classification Report:")
print(report)


Classification Report:
                                      precision    recall  f1-score   support

                       Not_offensive       0.98      1.00      0.99      1765
     Offensive_Targeted_Insult_Group       1.00      0.43      0.61        23
Offensive_Targeted_Insult_Individual       1.00      0.56      0.71        27
               Offensive_Untargetede       1.00      0.52      0.68        29

                            accuracy                           0.98      1844
                           macro avg       0.99      0.63      0.75      1844
                        weighted avg       0.98      0.98      0.98      1844



Perceptron classifier

In [33]:
from sklearn.linear_model import Perceptron

In [34]:
perceptron_classifier = Perceptron()
perceptron_classifier.fit(X_train_combined, y_train)



In [35]:
# Evaluate the classifier
predictions = perceptron_classifier.predict(X_test_combined)



In [36]:
# Calculate precision, recall, and F1-score
report = classification_report(y_test, predictions)
print("Classification Report:")
print(report)

Classification Report:
                                      precision    recall  f1-score   support

                       Not_offensive       0.99      0.99      0.99      1765
     Offensive_Targeted_Insult_Group       0.57      0.52      0.55        23
Offensive_Targeted_Insult_Individual       0.74      0.63      0.68        27
               Offensive_Untargetede       0.89      0.59      0.71        29

                            accuracy                           0.98      1844
                           macro avg       0.80      0.68      0.73      1844
                        weighted avg       0.98      0.98      0.98      1844



AdaBoost Classifier

In [44]:
from sklearn.ensemble import AdaBoostClassifier

In [45]:
adaboost_classifier = AdaBoostClassifier()
adaboost_classifier.fit(X_train_combined, y_train)




In [46]:
predictions = adaboost_classifier.predict(X_test_combined)

In [47]:
report = classification_report(y_test, predictions)
print("Classification Report:")
print(report)

Classification Report:
                                      precision    recall  f1-score   support

                       Not_offensive       0.97      0.99      0.98      1765
     Offensive_Targeted_Insult_Group       0.60      0.26      0.36        23
Offensive_Targeted_Insult_Individual       0.00      0.00      0.00        27
               Offensive_Untargetede       0.38      0.21      0.27        29

                            accuracy                           0.96      1844
                           macro avg       0.49      0.37      0.40      1844
                        weighted avg       0.94      0.96      0.95      1844



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
