In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import re
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC,SVC,NuSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [4]:
train_tweets = pd.read_csv(r'train_tweets.csv')
test_tweets = pd.read_csv(r'test_tweets.csv')

In [5]:
def form_sentence(tweet):
    tweet_blob = TextBlob(tweet)
    return ' '.join(tweet_blob.words)

In [6]:
def stop_words(tweet):
    tweet_list = [ele for ele in tweet.split() if ele != 'user']
    clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
    return clean_mess

In [7]:
def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        normalized_tweet = ' '.join(normalized_tweet)
        return normalized_tweet

In [8]:
def stemming(tweet):
    stemmer = PorterStemmer()
    stemmed_tweet = []
    for word in tweet:
        stemmed_word = stemmer.stem(word) 
        stemmed_tweet.append(stemmed_word)
    return stemmed_tweet

In [9]:
train_tweet = {}
train_tweet = pd.DataFrame(train_tweet)

In [10]:
tweet = []
label = train_tweets['label']
for i in range(len(train_tweets)):
    tweet.append(normalization(stemming(stop_words(form_sentence(train_tweets['tweet'][i])))))

In [11]:
train_tweet['id'] = train_tweets['id']
train_tweet.insert(1, "label", label, True)
train_tweet.insert(2, "tweet", tweet, True)

In [59]:
train_msg, test_msg, label_train, label_test = train_test_split(train_tweet['tweet'], train_tweet['label'], test_size=0.2)

In [60]:
pipeline = Pipeline([
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('classifier',RandomForestClassifier()),
            ])

In [61]:
pipeline.fit(train_msg,label_train)
predictions = pipeline.predict(test_msg)
print(classification_report(predictions,label_test))
print(confusion_matrix(predictions,label_test))
print(accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      6126
           1       0.54      0.88      0.67       267

    accuracy                           0.96      6393
   macro avg       0.77      0.92      0.82      6393
weighted avg       0.98      0.96      0.97      6393

[[5924  202]
 [  33  234]]
0.9632410448928516


In [50]:
tweet = []
tweet.append("Donald trump lack the willpower")
tweet.append("Arpit is happy")
print(pipeline.predict(tweet))

[0 0]
