In [4]:
import numpy as np
import pandas as pd

from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer



lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
tw_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [5]:
# ne vom folosi de un lemmatizer pentru a aduce cuvintele la forma din dictionar
# vom elimina toate stop-words urile
# pentru token-izare ne vom folosi de un tweer tokenizer

In [6]:
def get_data(filename):
    df = pd.read_csv(filename,delimiter = ',')      #citim dataset ul 
    tweets = df['tweet'].values     
    labels = df['label'].values
                                                                              # functie prin care
    shuffle_stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.2)    # impartim dataset ul in 80% date de train
                                                                              # si 20% date de test 
    for train_index, test_index in shuffle_stratified.split(tweets, labels):
        tweets_train, tweets_test = tweets[train_index], tweets[test_index]
        labels_train, labels_test = labels[train_index], labels[test_index]


        
    return tweets_train,labels_train,tweets_test, labels_test

In [7]:
tweets_train, y_train,tweets_test,y_test = get_data("train.csv")

In [8]:
filtered_tweets = []
for tweet in tweets_train:
    tokens = tw_tokenizer.tokenize(tweet)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    
    for token in filtered_tokens:
        token = lemmatizer.lemmatize(token)
    tweet = " ".join(filtered_tokens)
    filtered_tweets.append(tweet)
    


In [9]:
# vom tokeniza fiecare tweet, eliminand stop words-urile,apoi vom lemmatiza fiecare token, si unim tweet-ul inpoi
# insa va fi format din cuvintele prelucrate pana acum, vom avea o noua lista de tweet uri -> filtered_tweets

In [10]:
count_vectorizer = CountVectorizer(lowercase=True, analyzer='word', stop_words='english')

count_vectorizer.fit(filtered_tweets)    #se creeaza dictionarul de token-uri, se mapeaza fiecare token la o pozitie
X_train = count_vectorizer.transform(filtered_tweets)    #extragerea de token count-uri atat din train cat si din test
X_test = count_vectorizer.transform(tweets_test)

In [None]:
# vom converti colectia noastra de tweeturi intr-o matrice de token count-uri folosind count_vectorizer

In [53]:
# Vom folosi clasificatorul Naive Bayes 
model = MultinomialNB(alpha=0.01)    # initializarea modelului
model.fit(X_train, y_train)          # si antrenarea acestuia folosind matricea sparsa de train

predictions = model.predict(X_test)  # predictia

print(accuracy_score(y_test, predictions))  # vom afisa metricile de acuratete pentru modelul nostru
print(classification_report(y_test, predictions))


0.9596433599249179
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5945
           1       0.82      0.55      0.66       448

    accuracy                           0.96      6393
   macro avg       0.89      0.77      0.82      6393
weighted avg       0.96      0.96      0.96      6393

