<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Importing-Libraries" data-toc-modified-id="Importing-Libraries-0.1">Importing Libraries</a></span></li><li><span><a href="#Importing-the-dataset" data-toc-modified-id="Importing-the-dataset-0.2">Importing the dataset</a></span></li><li><span><a href="#Cleaning-and-organizing-our-data" data-toc-modified-id="Cleaning-and-organizing-our-data-0.3">Cleaning and organizing our data</a></span><ul class="toc-item"><li><span><a href="#Distribution-of-Data---Target" data-toc-modified-id="Distribution-of-Data---Target-0.3.1">Distribution of Data - Target</a></span></li><li><span><a href="#Steps-to-clean-the-data" data-toc-modified-id="Steps-to-clean-the-data-0.3.2">Steps to clean the data</a></span></li></ul></li><li><span><a href="#Sentiments" data-toc-modified-id="Sentiments-0.4">Sentiments</a></span></li><li><span><a href="#Pre-process-Text" data-toc-modified-id="Pre-process-Text-0.5">Pre-process Text</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Remove-STOP-Words" data-toc-modified-id="Remove-STOP-Words-0.5.0.1">Remove STOP Words</a></span></li></ul></li></ul></li><li><span><a href="#Split-Data" data-toc-modified-id="Split-Data-0.6">Split Data</a></span></li><li><span><a href="#TF-IDF" data-toc-modified-id="TF-IDF-0.7">TF-IDF</a></span><ul class="toc-item"><li><span><a href="#CountVectorizer" data-toc-modified-id="CountVectorizer-0.7.1">CountVectorizer</a></span></li><li><span><a href="#Stemming" data-toc-modified-id="Stemming-0.7.2">Stemming</a></span></li><li><span><a href="#Tokenize" data-toc-modified-id="Tokenize-0.7.3">Tokenize</a></span></li></ul></li></ul></li><li><span><a href="#Exploratory-Data-Analysis-(EDA)" data-toc-modified-id="Exploratory-Data-Analysis-(EDA)-1">Exploratory Data Analysis (EDA)</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#TF-IDF-Vector" data-toc-modified-id="TF-IDF-Vector-1.0.1">TF IDF Vector</a></span></li><li><span><a href="#Positive" data-toc-modified-id="Positive-1.0.2">Positive</a></span></li></ul></li><li><span><a href="#N-grams" data-toc-modified-id="N-grams-1.1">N-grams</a></span></li></ul></li></ul></div>

### Importing Libraries

In [1]:
# Import Sklearn libraries to build models 
from sklearn.feature_extraction.text import TfidfVectorizer #  TF-IDF to vectorize words 
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC


# Import Libraries to perform computation and do visualization. 
import pandas as pd
import numpy as np
np.random.seed(0)
import seaborn as sns
import matplotlib.pyplot as plt
import string

# Import nltk to check english lexicon.
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk import word_tokenize, FreqDist
from nltk import pos_tag # for Parts of Speech tagging
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.corpus import stopwords
# Generate wordcloud for word distribution visualization.
from wordcloud import WordCloud 

# Generating random numbers.
import random 


# Transforms text to a fixed-length vector of integers.
from gensim.models import Word2Vec 

#Efficient functions to search in strings.
import re as re 

# Import images for world cloud.
from PIL import Image, ImageDraw, ImageFont 


# Import Yellowbrick and vector coupon for visualization of frequent words
 
from yellowbrick.text import FreqDistVisualizer
from yellowbrick.datasets import load_hobbies

from os import path
from os import environ

In [2]:
from xgboost import XGBClassifier

In [None]:
# !pip install yellowbrick

### Importing the dataset

In [49]:
DATASET_COLUMNS=["sentiment", "ids", "date", "flag", "user", "tweet"]
DATASET_ENCODING = "ISO-8859-1"
df = pd.read_csv(r'D:\OneDrive - NITT\Custom_Download\training.1600000.processed.noemoticon.csv', encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

df.head()

Unnamed: 0,sentiment,ids,date,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [87]:
ta_df = df.sample(50000,random_state=42)

In [None]:
ta_df.shape

In [None]:
ta_df.info()

In [None]:
ta_df.isna().mean()*100

In [None]:
ta_df['flag'].unique()

In [None]:
ta_df.drop(['flag','ids'], axis=1, inplace = True)


In [None]:
ta_df.head()

### Cleaning and organizing our data 

In [None]:
# Checking length 
print('length of data is', len(ta_df))

#### Distribution of Data - Target 

In [None]:
ta_df['sentiment'].unique()

In [None]:
ta_df['sentiment'].value_counts(normalize=True)*100

In [51]:

# Replacing the value 4 -->1 for ease of understanding.
# 0 = negative, 1 = positive
ta_df['sentiment'] = ta_df['sentiment'].replace(4,1)
ta_df.head()

Unnamed: 0,sentiment,ids,date,flag,user,tweet
431887,0,2064637119,Sun Jun 07 06:47:58 PDT 2009,NO_QUERY,bwness,My parents are replacing my bed cause it's so ...
12732,0,1551959701,Sat Apr 18 10:25:11 PDT 2009,NO_QUERY,GeorgiaPeach777,Waiting 4 Days Difference 2 call...and I have...
1248205,1,1995777536,Mon Jun 01 14:01:52 PDT 2009,NO_QUERY,JoannaAngel,a big salad on my neck would be pretty sweet! ...
1333759,1,2016517120,Wed Jun 03 06:54:56 PDT 2009,NO_QUERY,shortarican,Just getting to work I got alot on my plate to...
869248,1,1678091354,Sat May 02 05:27:32 PDT 2009,NO_QUERY,TreeFalldesign,"@uhandbag no problem, it's nothing important"


In [None]:
# Plot the count plot for the target labels.


p = sns.countplot(data = ta_df, y = 'sentiment', palette="Set3") #Setting p to plot of Emotion
p.set(xlabel = 'count') #Labling X
p.set(ylabel = 'sentiment') #Labling Y
p.set(title = "Count of Tweets per Sentiment")

In [None]:
print('Count of columns in the data is:  ', len(ta_df.columns))
print('Count of rows in the data is:  ', len(ta_df))

#### Steps to clean the data

* Checking and handling NaN values
* Drop duplicate
* Cleaning up product names in order to unify brand names


In [None]:
ta_df.dropna(inplace=True)

In [None]:
len(ta_df.duplicated(keep='last'))

In [None]:
ta_df.drop_duplicates()
ta_df.info()

In [None]:
# Checking for Null values.  We use the heatmap code which shows the contrast well.

sns.heatmap(ta_df.isnull(), cbar=False)
plt.title("NaNs")
plt.xlabel('Culomns')
plt.ylabel('Row')
plt.show()

In [None]:
# Removing duplicate values
ta_df.drop_duplicates(inplace = True) 

### Sentiments

In [None]:
ta_df.head(5)

In [None]:
# Creating an independent copy  

ta_df_copy = ta_df.copy()

### Pre-process Text

We will use text processing to allow the data to be more digestible for model use later in this project. This is an integral step in Natural Language Processing (NLP). 


The Preprocessing steps taken are:


* Removing Stopwords: Stopwords are common words used in the English language and do not add meaning to the sentences. Therefore we can remove them without sacrificing the meaning of the sentence. 

* Removing Words  with 2 letters: Words with length less than 2 are removed.

* Converting to lower case letters: Each text will be transferred to a lower case letter.

* Replacing http with space: Links starting with "http" or "https" or "www" are replaced by " 
".

* Lemmetizing 

In [None]:
def clean_tweet(tweet):
    
    tweet= tweet.lower()
    
    # Remove user_name @
    tweet = re.sub('(@[a-z0-9]+)\w+', '', tweet)
    
    tweet = re.sub('(#[a-z0-9]+)\w+', '', tweet)
    
    # Remove all non alphabets
    tweet = re.sub('([^0-9a-z \t])', '', tweet)
    
    # Remove hyperlinks
    tweet = re.sub(r'http\S+', '', tweet)
    
    # Remove RT
    tweet = re.sub(r'^rt[\s]+', '', tweet)
    
    return tweet

In [None]:
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [52]:
from nltk.stem import WordNetLemmatizer

In [53]:
wordLemm = WordNetLemmatizer()

In [54]:
wordLemm.lemmatize('abaci')

'abacus'

In [55]:
ps = PorterStemmer()

In [56]:
ps.stem('playing')

'play'

In [57]:
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# Replace 3 or more consecutive letters by 2 letter.
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

stop_words = set(stopwords.words("english"))

def preprocess(text):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    text = re.sub(sequencePattern, seqReplacePattern, text)
    
    tokens = []
    for token in text.split():
        if token not in stop_words:
            tokens.append(ps.stem(token))
    return " ".join(tokens)

In [58]:
ta_df['tweet'][0]

KeyError: 0

In [None]:
preprocess(ta_df['tweet'][0])

In [59]:
from tqdm.notebook import tqdm

In [60]:
tqdm.pandas()

In [61]:
ta_df['clean_tweet'] = ta_df['tweet'].progress_apply(preprocess)

  0%|          | 0/50000 [00:00<?, ?it/s]

In [None]:
# ta_df.drop('clean_tweets',axis=1,inplace=True)

In [None]:
ta_df.head()

##### Remove STOP Words

In [None]:
# def stop_stem_token_tweet(x):
#     temp = x.split() #1 tokenization
#     stemmed = [ps.stem(token) for token in temp if token not in stop_words]
#     return ' '.join(stemmed)

In [None]:
# complete tf-df
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# countvectorizer
# plot hashtags, usename

### Split Data 

In [62]:
X = ta_df['clean_tweet']

y = ta_df['sentiment']

In [63]:
list(set(ta_df['sentiment']))

[0, 1]

In [64]:
y

431887     0
12732      0
1248205    1
1333759    1
869248     1
          ..
828369     1
927150     1
1467597    1
1052103    1
1055141    1
Name: sentiment, Length: 50000, dtype: int64

In [65]:
ta_df['sentiment'].value_counts()

1    25081
0    24919
Name: sentiment, dtype: int64

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 43)

In [67]:
# Checking shape of train and test data

print(X_train.shape, y_train.shape) 

print(X_test.shape , y_test.shape) 


(37500,) (37500,)
(12500,) (12500,)


In [68]:
### Visulization

### TF-IDF

In [69]:
vectoriser_tfidf = TfidfVectorizer()

In [70]:
vectoriser_tfidf.fit(X_train)

print('Number of feature words:', len(vectoriser_tfidf.get_feature_names()))

Number of feature words: 24531




In [71]:
X_train_tfidf = vectoriser_tfidf.transform(X_train)
X_test_tfidf  = vectoriser_tfidf.transform(X_test)

In [30]:
# print(X_train_tfidf.toarray())

In [None]:
1199859*221591/1024/1024

#### CountVectorizer

In [72]:
vectorizer_vc = CountVectorizer()
vectorizer_vc.fit(X_train)

In [73]:
X_train_vc = vectorizer_vc.transform(X_train)
X_test_vc  = vectorizer_vc.transform(X_test)

# Classification Model

In [79]:
clf = RandomForestClassifier(random_state=42,n_jobs=-1)

In [80]:
clf.fit(X_train_tfidf,y_train)

In [81]:
y_pred = clf.predict(X_test_tfidf)

In [82]:
from sklearn.metrics import accuracy_score,classification_report

In [83]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.75      0.72      0.73      6253
           1       0.73      0.75      0.74      6247

    accuracy                           0.74     12500
   macro avg       0.74      0.74      0.74     12500
weighted avg       0.74      0.74      0.74     12500



In [86]:
# # TODO:
# DummyClassifier :https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
# 
# 1- N (2,3)-Gram, try on samll data first
# 2- try on complete dataset, only if u r getting better rsult in step-1
# 3- Try word2vec, Glove (u need to download vector file called.. google-news-50d...)
# 4 - then try DT, RF, XgBoost , Naive Bayes
# 5 word cloud


# # MLP using keras

In [84]:
import gensim

In [85]:
gensim.models.word2vec.Word2Vec

gensim.models.word2vec.Word2Vec

In [None]:
clf.feature_importances_

In [None]:
vectoriser_tfidf.vocabulary_

In [None]:
fi = clf.feature_importances_

In [None]:
fn = vectoriser_tfidf.get_feature_names()

In [None]:
df = pd.DataFrame({'f_imp':fi,'f_name':fn})

In [None]:
df.sort_values('f_imp',ascending=False)

In [44]:
clf_xg = XGBClassifier()

In [45]:
clf_xg.fit(X_train_tfidf,y_train)

In [46]:
y_pred = clf_xg.predict(X_test_tfidf)

In [47]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.71      0.65      0.68       623
           1       0.68      0.74      0.71       627

    accuracy                           0.70      1250
   macro avg       0.70      0.70      0.69      1250
weighted avg       0.70      0.70      0.69      1250



In [None]:
# corpus = [
#     'This is the first document.',
#     'This document is the second document.',
#     'And this is the third one.',
#     'Is this the first document?',
# ]
# vectorizer = CountVectorizer() # ,TfidfVectorizer()
# X = vectorizer.fit_transform(corpus)

In [None]:
# tweet = tweet.apply(str)

In [None]:
# import time
# t = time.time()
# processed_text = text_process(clean_tweets)
# print(f'Text Preprocessing complete.')
# print(f'Time Taken: {round(time.time()-t)} seconds')

We create a new column for the clean Tweets but we will keep the old column as well in case we need it later on. 

We remove any short words that are less than 2 letters as they often happen to be meaningless and will not help our model. 

In [None]:
# Remove short words
#ta_df['Clean_Tweets'] = ta_df['Clean_Tweets'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))


#### Stemming
We stem the words to make our text more standardize.


ps = nltk.PorterStemmer()

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

tokenized_tweets = ta_df['clean_tweets'].apply(lambda x: stemming(x))
ta_df.head()

#### Tokenize
We tokenize the tweets so that we can make a use of our text. 

In [None]:
#Create a function to tenize the tweets

ta_df['tokenize_tweets'] = clean_tweets.apply(word_tokenize)
#ta_df['tokenize_tweets'].head()

In [None]:
ta_df.head()

##  Exploratory Data Analysis (EDA)

#### TF IDF Vector

TF-IDF is a way to measure statistically the relevant words in the document. It is effectively the product of frequency in which a word appears in a text and inverse document frequency (I.e. whether the word is rare or common in the text.)

In [None]:
tweets_concat = []
for tweet in ta_df['tokenize_tweets']:
   tweets_concat += tweet

In [None]:
tweets_freqdist = FreqDist(tweets_concat)
tweets_freqdist.most_common(200)

In [None]:
vectorizer = CountVectorizer()
docs       = vectorizer.fit_transform(tweets_concat)
features   = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(docs)
visualizer.show()

The top 10 are The follwoing: 
    
    1.sxsw
    2.mention
    3.link
    4.ipad
    4.apple
    5.google
    6.iphone
    7.quot
    8.store
    9.app
    10.new

***QUESTION - How can it be helpful for me to know this *** 

In [None]:
def red_color_func(word, font_size, position, orientation, random_state=None,**kwargs):
    return tuple(Reds_9.colors[random.randint(6,8)])  # Function to help us generate wordcloud

* What are the most common words used in our data?

In [None]:
# We visualize our data with a word cloud 

all_words = ' '.join([text for text in ta_df['clean_tweets']])

wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

* What are the most common words in our data for negative and positive tweets?

####                                                                       Positive 

In [None]:
ta_df['sentiment']

In [None]:
# Extract all tweets into one long string with each word separate with a "space"
ta_df['clean_tweets'] = ta_df['clean_tweets'].apply(str)
ta_df['sentiment'] = ta_df['sentiment'].apply(str)
tweets_long_string = ta_df['sentiment'].tolist()
tweets_long_string = " ".join(tweets_long_string)


In [None]:
# Choosing sentiments that are positive and combining them

# po_sentiments = ta_df[ta_df['sentiment']=='positive'] 

# pt= " ".join([sentence for sentence in ta_df['clean_tweets'] =='positive'])

# # Choosing sentiments that are negative and combining them

# ne_sentiments = ta_df[ta_df['sentiment']=='negative']

# nt = " ".join([sentence for sentence in ta_df['clean_tweets'][ta_df['sentiment']=='negative']])

    * QUESTION WHY DOESNT WORK
    

font_path = "/Users/nataliaedelson/Desktop/OpenSans-CondBold.ttf"

In [None]:
# We build a function to generate word cloud
def plot_wordcloud(wordcloud):
    plt.figure(figsize=(30,30))
    plt.imshow(wordcloud)
    plt.axis("off")

In [None]:
# Add thumbs up 


In [None]:
# Import image to np.array
mask = np.array(Image.open('td.png'))
# Generate wordcloud
wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='white', colormap='Reds', collocations=False, contour_color = 'white',contour_width=1, mask=mask).generate(nt)
# Plot
plot_wordcloud(wordcloud)

We would like to see if I can get more color on the word distribution and check what the output will be if we extract adjectives.

In [None]:
# We build a function to obtain adjectives from tweets
def getAdjectives(tweet):
    tweet = word_tokenize(tweet)  # convert string to tokens
    tweet = [word for (word, tag) in pos_tag(tweet)
             if tag == "JJ"]  # pos_tag module in NLTK librarybb
    return " ".join(tweet)  # join words with a space in between them

In [None]:
# Convert teh column in to strings 
ta_df['clean_tweets'] = ta_df['clean_tweets'].apply(str)

In [None]:
# Apply getAdgectives to our tweets
ta_df['tweets_adjectives'] = ta_df['clean_tweets'].apply(getAdjectives)

In [None]:
ta_df.head() # Check dataframe first 5 rows

In [None]:
# Extract all tweets into one long string with each word separate with a "space"
tweets_long_string = ta_df['tweets_adjectives'].tolist()
tweets_long_string = " ".join(tweets_long_string)

In [None]:
#Frequency of words in our data
#fdist = FreqDist(TA['Clean_Tweets'])
#WordCloud
wc = WordCloud(width=900, height=500, max_words=50).generate(tweets_long_string )
plt.figure(figsize=(12,10))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
total_vocab = set()
for comment in ta_df['clean_tweets']:
    total_vocab.update(comment)
len(total_vocab)

*** DOUBLE CHECK # There are 7,921 unique words

In [None]:
# Distribution of data focusing on sentiments and sentiment’s brand 

In [None]:
p = sns.countplot(data = ta_df, y = 'sentiment', palette="Set3") #Setting p to plot of Emotion
p.set(xlabel = 'count') #Labling X
p.set(ylabel = 'sentiment') #Labling Y
p.set(title = "Count of Tweets per Sentiment")

There are four more times positive tweets than negatives. There are barely any neutral tweets.


In [None]:
p = sns.countplot(data = ta_df, y = 'product_brand', palette="Set3") #Setting p to plot of Emotion
p.set(xlabel = 'Count') #Labling X
p.set(ylabel = 'Product_Brand') #Labling Y
p.set(title = "Count of Tweets per Sentiment")



Apple has a little more than double tweets 

In [None]:
ta_df.head()

In [None]:
display(ta_df.groupby(['sentiment'])['product_brand'].value_counts()) #Checking tweets rated by brand

display(ta_df.groupby(['product_brand'])['sentiment'].value_counts()) #Checking tweets rated by brand

In [None]:
plot = sns.countplot(data = ta_df, x = 'product_brand', hue = 'sentiment') #Setting p to plot of Brand and Emotion
plot.legend(title = 'sentiment', bbox_to_anchor = (1, 1), loc = 'upper left') #Creating legend for plot
plot.set(xlabel = 'Product_Brand') #Setting x label
plot.set(ylabel = 'Count') #Setting y label
plot.set(title = 'No. of Tweets per Brand by Sentiment'); #Setting title of plot

In [None]:
plot = sns.countplot(data = ta_df, x = 'sentiment', hue = 'product_brand',color='silver') #Setting p to plot of Emotion and Brand
plot.legend(title = 'Product_Brand', bbox_to_anchor = (1, 1), loc = 'upper left') #Creating legend for plot
plot.set(xlabel = 'Sentiment') #Setting x label
plot.set(ylabel = 'Count') #Setting y label
plot.set(title = 'No. of Tweets per sentiment by Brand'); #Setting title of plot

#https://www.sharpsightlabs.com/blog/seaborn-countplot/

### N-grams

In [None]:
stop 

In [None]:
#Converting to dummy variables 

sentiments_dummies = pd.get_dummies(TA["Sentiment"], prefix="Sentiment")
TA = pd.concat([TA, sentiments_dummies ], axis = 1)
TA.head()

In [None]:

X = ta_df["tokenize_tweets"]  
y = ta_df["sentiment"]
 

In [None]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
 

In [None]:
ta_df.head()


In [None]:
#Transforming data set

In [None]:
vectorizer =TfidfVectorizer(ngram_range=(1,2), max_features =40000)

In [None]:
#vectorizer.fit(X_train)

In [None]:
Print("Number of words:", len(vectorizer.getfeature_names()))

In [None]:
'''
    from sklearn.ensemble import RandomForestClassifier
    text_classifier = RandomForestClassifier(n_estimators=100, random_state=0)  
    text_classifier.fit(X_train, y_train)
    '''

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS
 
# Wordcloud with positive tweets
positive_tweets = df['tweet'][df["sentiment"] == 'Positive']
stop_words = ["https", "co", "RT"] + list(STOPWORDS)
positive_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(positive_tweets))
plt.figure()
plt.title("Positive Tweets - Wordcloud")
plt.imshow(positive_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
 
# Wordcloud with negative tweets
negative_tweets = df['tweet'][df["sentiment"] == 'Negative']
stop_words = ["https", "co", "RT"] + list(STOPWORDS)
negative_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(negative_tweets))
plt.figure()
plt.title("Negative Tweets - Wordcloud")
plt.imshow(negative_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [89]:
pip install dmba

Collecting dmba
  Downloading dmba-0.1.0-py3-none-any.whl (11.8 MB)
Installing collected packages: dmba
Successfully installed dmba-0.1.0
Note: you may need to restart the kernel to use updated packages.




In [90]:
from dmba import regressionSummary, classificationSummary, liftChart, gainsChart

In [91]:
regressionSummary([2,3,4,5],[1,2,3,4])


Regression statistics

                      Mean Error (ME) : 1.0000
       Root Mean Squared Error (RMSE) : 1.0000
            Mean Absolute Error (MAE) : 1.0000
          Mean Percentage Error (MPE) : 32.0833
Mean Absolute Percentage Error (MAPE) : 32.0833
