In [None]:
# Task 1 IMPORT LIBRARIES AND DATASETS

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
# Set the notebook's plotting style to the monokai theme.
# ticks=True is what keeps the x and y axes visible.
jtplot.style(
    theme='monokai',
    context='notebook',
    ticks=True,
    grid=False,
)


In [None]:
# Load the data
# 'twitter.csv' is a relative path, so it is resolved against the current working directory.
tweets_df = pd.read_csv('twitter.csv')

In [None]:
# Display the freshly loaded DataFrame as this cell's output.
tweets_df

In [None]:
# Print the frame's column names, dtypes and non-null counts.
tweets_df.info()

In [None]:
# Show summary statistics for the frame's columns.
tweets_df.describe()

In [None]:
# Display the 'tweet' column on its own.
tweets_df['tweet']

In [None]:
# Drop the 'id' column.
# errors='ignore' keeps this cell safe to re-run: once 'id' is gone, a second
# run is a no-op instead of raising KeyError.
tweets_df = tweets_df.drop(columns=['id'], errors='ignore')

In [None]:
#Task 2 PERFORM DATA EXPLORATION
# Heatmap of the missing-value mask, drawn without row labels or a colour bar.
missing_mask = tweets_df.isna()
sns.heatmap(missing_mask, cmap="Blues", cbar=False, yticklabels=False)

In [None]:
# Histograms of the frame's columns: 30 bins, red bars, 13x5 figure.
tweets_df.hist(
    bins=30,
    figsize=(13, 5),
    color='r',
)

In [None]:
# Class balance: number of tweets per label.
# The column is passed as x= explicitly: seaborn >= 0.12 makes every argument
# after `data` keyword-only, so a positional Series is read as `data`, not `x`.
sns.countplot(x=tweets_df['label'], label="Count")

In [None]:
# Per-label tweet count (this cell repeats the count plot of the 'label' column).
# The column is passed as x= explicitly: seaborn >= 0.12 makes every argument
# after `data` keyword-only, so a positional Series is read as `data`, not `x`.
sns.countplot(x=tweets_df['label'], label="Count")

In [None]:
# Display the DataFrame again in its current state.
tweets_df

In [None]:
# Show summary statistics for the frame's columns in its current state.
tweets_df.describe()

In [None]:
# Let's see the shortest message
# No cell in this notebook creates a 'length' column, so the character counts
# are computed here, and the minimum is looked up instead of being hard-coded.
tweet_lengths = tweets_df['tweet'].str.len()
tweets_df.loc[tweet_lengths == tweet_lengths.min(), 'tweet'].iloc[0]

In [None]:
# Task 3 PLOT THE WORDCLOUD
# Rows whose label is 0 make up the "positive" subset.
is_positive = tweets_df['label'] == 0
positive = tweets_df[is_positive]
positive

In [None]:
# Rows whose label is 1 make up the "negative" subset.
is_negative = tweets_df['label'] == 1
negative = tweets_df[is_negative]
negative

In [None]:
# Collect every tweet into a plain Python list and report how many there are.
tweet_column = tweets_df['tweet']
sentences = tweet_column.tolist()
len(sentences)

In [None]:
# Concatenate all tweets into one string, separated by single spaces.
sentences_as_one_string =" ".join(sentences)

In [None]:
# Preview only the beginning of the joined text: echoing the whole string
# would dump every tweet into the cell output.
sentences_as_one_string[:1000]

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Build one word cloud from the text of all tweets and draw it on a 20x20 figure.
all_tweets_cloud = WordCloud().generate(sentences_as_one_string)
plt.figure(figsize=(20, 20))
plt.imshow(all_tweets_cloud)

In [None]:
# Task 4 PERFORM DATA CLEANING - REMOVE PUNCTUATION FROM TEXT
import string
# Display the characters that the cleaning code treats as punctuation.
string.punctuation

In [None]:
# Worked example: drop every punctuation character, then rebuild the string.
Test = '$I love AI & Machine learning!!'
Test_punc_removed = list(filter(lambda ch: ch not in string.punctuation, Test))
Test_punc_removed_join = ''.join(Test_punc_removed)
Test_punc_removed_join

In [None]:
# A longer sample sentence, containing an emoticon, an ellipsis and exclamation marks.
Test = 'Good morning beautiful people :)... I am having fun learning Machine learning and AI!!'

In [None]:
# Keep only the characters of Test that are not punctuation.
Test_punc_removed = []
for char in Test:
    if char not in string.punctuation:
        Test_punc_removed.append(char)
Test_punc_removed

In [None]:
# Join the remaining characters back into a single string and display it.
Test_punc_removed_join = ''.join(Test_punc_removed)
Test_punc_removed_join

In [None]:
#Task 5 PERFORM DATA CLEANING - REMOVE STOPWORDS
import nltk # Natural Language Toolkit
# Download the 'stopwords' corpus.
nltk.download('stopwords')

# The stopwords corpus has to be downloaded (the call above) before it can be read.
from nltk.corpus import stopwords
# Display the English stop-word list.
stopwords.words('english')


In [None]:
# Demo sentence for stop-word removal.
Test_punc_removed_join = 'I enjoy coding, programming and Artificial intelligence'
# Build the stop-word set once: with the call inside the comprehension,
# stopwords.words('english') was re-evaluated for every word.
english_stopwords = set(stopwords.words('english'))
Test_punc_removed_join_clean = [word for word in Test_punc_removed_join.split() if word.lower() not in english_stopwords]

In [None]:
# Only the important (not so common) words are left.
Test_punc_removed_join_clean

In [None]:
# Display the unfiltered sentence for comparison with the cleaned word list.
Test_punc_removed_join

In [None]:
#Task 6 PERFORM COUNT VECTORIZATION (TOKENIZATION)
from sklearn.feature_extraction.text import CountVectorizer

# Four short documents used to illustrate count vectorization.
sample_data = [
    'This is the first paper.',
    'This document is the second paper.',
    'And this is the third one.',
    'Is this the first paper?',
]

# Fit the vectorizer on the samples and transform them in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample_data)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() turns the returned array into a plain list of Python strings.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

In [None]:
# Print X converted to a dense array.
print(X.toarray())  

In [None]:
#Task 7 CREATE A PIPELINE TO REMOVE PUNCTUATIONS, STOPWORDS AND PERFORM COUNT VECTORIZATION
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def message_cleaning(message, stop_words=None):
    """Strip punctuation and stop words from a message.

    Parameters
    ----------
    message : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used.

    Returns
    -------
    list of str
        The whitespace-separated tokens that remain, in their original order
        and casing.
    """
    if stop_words is None:
        # Previously stopwords.words('english') was evaluated once per word;
        # the cached set is loaded a single time and gives O(1) membership tests.
        stop_words = _english_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    # Remove punctuation character by character, then split on whitespace.
    no_punct = ''.join(char for char in message if char not in string.punctuation)
    # Words are compared in lower case but returned with their original casing.
    return [word for word in no_punct.split() if word.lower() not in stop_words]

In [None]:
# Let's test the newly added function
# message_cleaning is applied to each entry of the 'tweet' column.
tweets_df_clean = tweets_df['tweet'].apply(message_cleaning)

In [None]:
# Show the cleaned-up version of the entry labelled 5.
print(tweets_df_clean[5])

In [None]:
# Show the original version of the same entry (row label 5, column 'tweet').
print(tweets_df.loc[5, 'tweet'])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Plug the message_cleaning function in as the analyzer so every tweet is
# tokenised by it; the counts use the uint8 dtype.
vectorizer = CountVectorizer(
    analyzer=message_cleaning,
    dtype=np.uint8,
)
tweets_countvectorizer = vectorizer.fit_transform(tweets_df['tweet'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() turns the returned array into a plain list of Python strings.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

In [None]:
# Print the tweet count matrix converted to a dense array.
# NOTE(review): toarray() materialises every cell; this may be memory-heavy
# if the vocabulary is large — confirm it fits in memory.
print(tweets_countvectorizer.toarray())  

In [None]:
# Display the dimensions of the tweet count matrix.
tweets_countvectorizer.shape

In [None]:
# Wrap the dense count matrix in a DataFrame to use as the feature table, then display it.
# NOTE(review): the dense conversion may be memory-heavy for a large
# vocabulary — confirm it fits in memory.
X = pd.DataFrame(tweets_countvectorizer.toarray())
X

In [None]:
# The 'label' column is the prediction target.
y = tweets_df['label']

In [None]:
# Task 8 TRAIN AND EVALUATE A NAIVE BAYES CLASSIFIER MODEL
# A notebook cell only echoes its last expression, so both shapes are returned
# together; on separate lines X.shape was evaluated but never shown.
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split (and every metric computed from
# it) is the same on each run; stratify=y keeps the label proportions equal
# in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Create the multinomial Naive Bayes model, then fit it on the training split.
NB_classifier = MultinomialNB()
NB_classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Predicting the Test set results
y_predict_test = NB_classifier.predict(X_test)
# Confusion matrix of the true test labels against the predictions.
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the cell counts as plain integers; the default annotation
# format abbreviates large counts in scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Print the classification report for the test-set predictions.
print(classification_report(y_test, y_predict_test))