In [1]:
#Importing the Libraries
import pandas as pd
# Location of the labelled tweet CSV, kept in one named constant so it is easy to repoint.
DATA_PATH = "E:/final_dataset_basicmlmodel.csv"
dataset = pd.read_csv(DATA_PATH)

In [2]:
# Preview the first rows of the raw frame to see which columns were loaded.
dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
#Understanding the tweets

# Print a small numbered sample (rows 10-14) of the raw tweets for eyeballing.
for position, tweet in enumerate(dataset["tweet"][10:15], start=1):
    print(position, ".", tweet)
    

1 .  â #ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   #blog #silver #gold #forex
2 . we are so selfish. #orlando #standwithorlando #pulseshooting #orlandoshooting #biggerproblems #selfish #heabreaking   #values #love #
3 . i get to see my daddy today!!   #80days #gettingfed
4 . ouch...junior is angryð#got7 #junior #yugyoem   #omg 
5 . i am thankful for having a paner. #thankful #positive     


Note: Noise present in the tweets

We can see that there are many hashtags present in the tweets of the form # symbol followed by text. We particularly don't need the # symbol so we will clean it out.
Also, there are strange symbols such as â (tweet 1) and ð (tweet 4). These are non-ASCII Unicode characters present in our dataset that we need to get rid of because they don't add anything meaningful.
There are also numerals and percentages, which the cleaning step below strips out as well.

# Data Cleaning

In [4]:
import re

def clean_text(text):
    """Normalise a raw tweet to lowercase letters, apostrophes and spaces.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The tweet with every character other than ASCII letters and
        apostrophes replaced by a single space, then lowercased. Runs of
        removed characters therefore become runs of spaces; they are not
        collapsed here.
    """
    # Anything that is not an ASCII letter or an apostrophe becomes a space.
    # This covers '#', '@', digits, punctuation and non-ASCII characters.
    letters_only = re.sub(r'[^a-zA-Z\']', ' ', text)

    # Drop non-ASCII characters. After the substitution above none remain,
    # so this pass is effectively a no-op; it is kept for parity.
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', letters_only)

    # Lowercase for consistent matching later on.
    return ascii_only.lower()



In [5]:
# Store the cleaned version of every tweet in a new column.
dataset['clean_text'] = dataset['tweet'].apply(clean_text)


# Feature Engineering

In [6]:
# STOPWORDS is the stop-word collection shipped with the `wordcloud` package.
from wordcloud import STOPWORDS

# Displayed for inspection only; none of the cells visible in this notebook
# filter the tweets with it.
print(STOPWORDS)

{'with', 'been', "didn't", 'same', "you've", 'r', 'because', 'few', 'was', 'herself', 'against', "aren't", "doesn't", 'it', 'some', 'from', 'at', 'them', "weren't", 'did', 'he', 'as', 'once', 'other', 'is', "we'll", 'your', 'itself', 'http', 'out', 'themselves', 'myself', "what's", 'himself', 'ourselves', 'nor', 'should', 'how', 'under', 'ours', 'after', 'would', 'his', "here's", 'which', 'by', 'her', 'any', 'about', "they've", "isn't", 'between', "couldn't", "they'd", "they're", "i've", "won't", "wouldn't", 'not', 'she', "it's", 'for', "don't", 'above', 'during', 'such', 'too', 'had', "you'll", "they'll", "who's", "hadn't", 'also', 'only', 'what', "how's", 'when', 'that', 'com', 'an', "he'd", "shan't", 'so', "i'd", 'else', 'yours', 'these', 'very', "we're", "wasn't", 'my', 'their', 'just', "he'll", 'over', 'those', 'they', 'both', 'to', 'be', 'ought', 'cannot', 'like', "why's", 'all', 'down', 'where', 'this', 'on', "she's", 'below', "when's", 'again', 'have', 'our', 'you', 'however', 

In [7]:
#Generating word frequency

def gen_freq(text):
    """Count how often each word occurs across a collection of tweets.

    Parameters
    ----------
    text
        Object whose ``split()`` yields one iterable of tokens per tweet.
        The notebook passes the ``.str`` accessor of the cleaned-text column.

    Returns
    -------
    pandas.Series
        Word counts indexed by word, as produced by ``value_counts()``
        (most frequent first).
    """
    # Flatten the per-tweet token lists into one long list of words.
    all_words = [word for tokens in text.split() for word in tokens]

    # value_counts() gives the frequency table, sorted in descending order.
    return pd.Series(all_words).value_counts()
    
#To check if a negative (negation) word is present
def any_neg(words):
    """Flag whether a tokenised tweet contains a negation.

    A token counts as a negation when it is one of 'n', 'no', 'non', 'not',
    or when it contains a word character followed by "n't" (e.g. "can't").

    Parameters
    ----------
    words : iterable of str
        The tokens of one tweet.

    Returns
    -------
    int
        1 if any token is a negation, otherwise 0 (also 0 for no tokens).
    """
    negations = ['n', 'no', 'non', 'not']
    found = any(
        token in negations or re.search(r"\wn't", token) is not None
        for token in words
    )
    return 1 if found else 0
    
#To check if any of the 100 rarest words is present
    
def any_rare(words, rare_100):
    """Flag whether a tokenised tweet contains at least one rare word.

    Parameters
    ----------
    words : iterable of str
        The tokens of one tweet.
    rare_100
        Container tested with ``in``. The notebook passes a slice of the
        word-frequency Series, for which ``in`` checks the index (the words).

    Returns
    -------
    int
        1 if any token is found in ``rare_100``, otherwise 0.
    """
    return int(any(token in rare_100 for token in words))
#Check whether any question word (when/what/how/why/who) is present
def is_question(words):
    """Flag whether a tokenised tweet contains a question word.

    Parameters
    ----------
    words : iterable of str
        The tokens of one tweet. Matching is exact and case-sensitive.

    Returns
    -------
    int
        1 if any token is 'when', 'what', 'how', 'why' or 'who', otherwise 0.
    """
    question_words = ['when', 'what', 'how', 'why', 'who']
    return int(any(token in question_words for token in words))

In [8]:
# Corpus-wide word frequencies (most frequent first)
word_freq = gen_freq(dataset['clean_text'].str)
# The 100 least frequent words: the tail of the descending frequency table
rare_100 = word_freq[-100:]

# Tokenise every cleaned tweet once and reuse the result for all token-based features
tokens = dataset['clean_text'].str.split()

# Number of words in a tweet
dataset['word_count'] = tokens.apply(len)
# 1 if the tweet contains a negation, else 0
dataset['any_neg'] = tokens.apply(any_neg)
# 1 if the tweet contains a question word, else 0
dataset['is_question'] = tokens.apply(is_question)
# 1 if the tweet contains any of the 100 rarest words, else 0
dataset['any_rare'] = tokens.apply(lambda tweet_tokens: any_rare(tweet_tokens, rare_100))
# Character count of the cleaned tweet
dataset['char_count'] = dataset['clean_text'].apply(len)

In [9]:
#Top 10 common words are
# The frequency table was already built as `word_freq` in the feature cell above,
# so take its head instead of re-tokenising the whole corpus a second time.
word_freq.head(10)

user    3351
the     1880
to      1497
a       1232
you      949
in       899
of       893
is       853
and      821
i        805
dtype: int64

In [10]:
# Preview the frame again to check the clean_text and engineered feature columns.
dataset.head()

Unnamed: 0,id,label,tweet,clean_text,word_count,any_neg,is_question,any_rare,char_count
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...,18,0,1,0,102
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...,19,1,0,0,122
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0,21
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...,12,0,0,0,86
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0,39


# Splitting the dataset into Train-Test split

In [46]:
from sklearn.model_selection import train_test_split

# Hand-crafted numeric features used as model inputs (order matters for the design matrix)
FEATURE_COLUMNS = ['word_count', 'any_neg', 'any_rare', 'char_count', 'is_question']

X = dataset[FEATURE_COLUMNS]
y = dataset['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)

In [47]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [48]:
# Baseline: ordinary least-squares regression on the engineered features.
# NOTE(review): `label` is later scored with accuracy_score as a class label, so a
# regressor is an unusual choice for this target; confirm this baseline is intentional.
model = LinearRegression()

In [49]:
# Fit the regressor on the training split (the 70% not held out by train_test_split).
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [50]:
# Scored on the training split, so this reflects fit quality, not generalisation.
r_sq = model.score(X_train, y_train)
print('coefficient of determination:', r_sq)
# NOTE(review): these regression predictions are overwritten by the Naive Bayes cell
# further down before being used anywhere in the visible notebook.
pred = model.predict(X_test)

coefficient of determination: 0.0171252008919


In [51]:
#Naive Bayes classifier from sklearn
from sklearn.naive_bayes import GaussianNB

# Build the Gaussian Naive Bayes classifier and fit it on the training split in one
# step (fit returns the estimator itself, so `model` is the fitted classifier).
model = GaussianNB().fit(X_train, y_train)
# Predict labels for the held-out test split
pred = model.predict(X_test)

In [52]:
#Evaluate the ML model
from sklearn.metrics import accuracy_score

# Share of test tweets whose predicted label matches the true label, as a percentage
accuracy_pct = accuracy_score(y_test, pred) * 100
print("Accuracy:", accuracy_pct, "%")


Accuracy: 58.2326764145 %
