### Importation of the necessary libraries

In [None]:
import nltk
import re

#Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

#Data Preprocessing and Feature Engineering
from textblob import TextBlob

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Importation and initial exploration of the data before cleaning

In [None]:
# Load the training data set into a DataFrame
# NOTE(review): the saved output of this cell is a FileNotFoundError. The path
# below is not under the Drive mount point (/content/drive) — confirm where
# train.csv actually lives and adjust the path if needed.
df = pd.read_csv('/content/train.csv')
df

FileNotFoundError: ignored

In [None]:
# Inspect the structure: column names, dtypes and non-null counts
df.info()

In [None]:
# Count missing values per column
df.isnull().sum()

In [None]:
df.sentiment.unique() # distinct sentiment labels present in the data

In [None]:
df.tweetid.nunique() # number of distinct tweet IDs (NOTE(review): this counts tweets, not authors)

In [None]:
# List the column names of the data set
df.columns

In [None]:
# Preview the first few raw messages
df.message.head()

In [None]:
# Print the first 10 messages in full to see what kind of noise they contain.
# head(10) selects rows by position, so the very first message is included
# (range(1, 11) with label indexing skipped row 0 and printed row 10 instead).
for i, message in enumerate(df.message.head(10), start=1):
    print('message number ',i)
    print('===================================================================================')
    print(message)
    print(" ")


### From the above we can see that some cleaning needs to be done 
1. URLs
2. Hashtags
3. Stopwords
4. Punctuation
5. Mentions (words starting with @)

#### Suggestion: we can create a function to handle the cleaning of the data

### Text Cleaning
* Removing Noise
* Tokenisation
* Stemming
* Lemmatisation
* Stop Words

In [None]:
# Download only the NLTK resources this notebook uses, non-interactively.
# Calling nltk.download() with no arguments opens the interactive downloader,
# which blocks an unattended "Restart & Run All".
# NOTE(review): 'stopwords' and 'wordnet' are the resources visibly used here;
# add further identifiers if later cells need other corpora or models.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

In [None]:
# Bar chart of how many tweets fall into each sentiment class
ax = df.sentiment.value_counts().plot(kind = 'bar')
# Label the figure so it can be read on its own when the notebook is skimmed
ax.set(title='Number of tweets per sentiment class',
       xlabel='sentiment',
       ylabel='number of tweets')
plt.show()

## Defining useful functions for cleaning the data

In [None]:
import string
# Show the characters that string.punctuation contains
print(list(string.punctuation))

In [None]:
# Tokenizer instance used by wrangle() to build the 'tokens' column
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
tokeniser = TreebankWordTokenizer()


In [None]:
# Fetch the WordNet corpus data for the lemmatizer created below
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

In [None]:
# Shared lemmatizer instance; wrangle() passes it to mbti_lemma
lemmatizer = WordNetLemmatizer()

In [None]:
def remove_urls(text):
    """Strip every whitespace-delimited chunk containing ``http:`` or ``https:``.

    Parameters
    ----------
    text : str
        Text that may contain URLs.

    Returns
    -------
    str
        ``text`` with each matching chunk replaced by an empty string; the
        surrounding whitespace is left untouched.
    """
    url_chunk = r"\S*https?:\S*"
    return re.sub(url_chunk, "", text, flags=re.MULTILINE)

In [None]:
def form_sentence(tweet):
    """Rebuild a tweet as a space-separated string of its TextBlob words.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The items of ``TextBlob(tweet).words`` joined with single spaces.
    """
    words = TextBlob(tweet).words
    return ' '.join(words)

In [None]:
def no_user_alpha(tweet):
    """Return the purely alphabetic, non-stopword tokens of a tweet.

    Parameters
    ----------
    tweet : str
        Tweet text; it is split on whitespace.

    Returns
    -------
    list of str
        The tokens, in their original order and casing, that
        * are not the literal placeholder ``'user'``,
        * consist only of word characters that are not digits, and
        * are not English stopwords (compared case-insensitively).
    """
    # Build the stopword set once per call; the original re-evaluated
    # stopwords.words('english') for every single token.
    stop_words = set(stopwords.words('english'))
    return [
        token
        for token in tweet.split()
        if token != 'user'
        and re.match(r'[^\W\d]*$', token)
        and token.lower() not in stop_words
    ]

In [None]:
#def normalization(tweet_list):
#  lem = WordNetLemmatizer()
 # normalized_tweet = []
 # for word in tweet_list:
 # normalized_text = lem.lemmatize(word,'v')
  #normalized_tweet.append(normalized_text)
 # return normalized_tweet

In [None]:
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer

In [None]:
def remove_punctuation(post):
    """Reduce a text to its alphabetic words, separated by single spaces.

    The text is split on whitespace and every character that is not a letter
    (punctuation, digits, symbols) is removed from each token. Tokens left
    empty by this, such as ``"123"`` or ``"--"``, are dropped, so the result
    never contains double or trailing spaces.

    Parameters
    ----------
    post : str
        Text to clean.

    Returns
    -------
    str
        The cleaned words joined with single spaces ('' if none remain).
    """
    # str.isalpha() is False for every punctuation character, so a separate
    # pass over string.punctuation is not needed.
    stripped = ("".join(ch for ch in token if ch.isalpha()) for token in post.split())
    # Skip tokens that had no letters at all instead of emitting empty strings.
    return ' '.join(word for word in stripped if word)

In [None]:
stemmer = SnowballStemmer('english') # Shared English stemmer instance; wrangle() passes it to mbti_stemmer
def mbti_stemmer(words, stemmer):
    """Stem each word with the given stemmer.

    Parameters
    ----------
    words : iterable of str
        Words to stem.
    stemmer : object
        Any object exposing a ``stem(word)`` method.

    Returns
    -------
    list
        ``stemmer.stem(word)`` for every word, in input order.
    """
    stems = []
    for token in words:
        stems.append(stemmer.stem(token))
    return stems

In [None]:
def mbti_lemma(words, lemmatizer):
    """Lemmatize each word with the given lemmatizer.

    Parameters
    ----------
    words : iterable of str
        Words to lemmatize.
    lemmatizer : object
        Any object exposing a ``lemmatize(word)`` method.

    Returns
    -------
    list
        ``lemmatizer.lemmatize(word)`` for every word, in input order.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Drop @-prefixed words, over-long words and the retweet marker from a text
def clean_text(X):
  """Filter the whitespace-separated words of ``X``.

  A word is kept only if it
    * does not start with "@",
    * is shorter than 9 characters, and
    * is not exactly the string 'rt'.

  Returns the surviving words joined with single spaces.
  """
  kept = [
      word
      for word in X.split()
      if not word.startswith("@") and len(word) < 9 and word != 'rt'
  ]
  return ' '.join(kept)

In [None]:
def wrangle(df,post_column):
  """Clean a text column in place and add token, stem and lemma columns.

  Intended to be applied to both the train and the test data so that both are
  preprocessed identically.

  Cleaning order (the order matters):
    1. lowercase
    2. remove URLs
    3. remove @mentions
    4. remove punctuation and other non-letter characters
    5. drop over-long words and the retweet marker 'rt' (clean_text)

  URLs and @mentions are removed BEFORE punctuation is stripped. Stripping
  punctuation first deletes the ':' , '/' and '@' characters those steps rely
  on, so they would never match anything.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the text column. It is modified in place.
  post_column : str
      Name of the text column to clean; it is overwritten with the cleaned
      text.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object, with ``post_column`` cleaned and three new
      columns: 'tokens', 'stem' and 'lemma'.
  """
  def _drop_mentions(text):
    # Remove whitespace-separated words that start with "@".
    return ' '.join(word for word in text.split() if not word.startswith('@'))

  # lowercase
  cleaned = df[post_column].str.lower()

  # remove urls (needs the ':' that punctuation removal would delete)
  cleaned = cleaned.apply(remove_urls)

  # remove words starting with @ (needs the '@' that punctuation removal would delete)
  cleaned = cleaned.apply(_drop_mentions)

  # remove punctuations
  cleaned = cleaned.apply(remove_punctuation)

  # drop over-long words and 'rt'; operates on the punctuation-free words
  cleaned = cleaned.apply(clean_text)

  df[post_column] = cleaned

  # tokenize
  df['tokens'] = df[post_column].apply(tokeniser.tokenize)

  #  stematize
  df['stem'] = df['tokens'].apply(mbti_stemmer, args=(stemmer, ))

  # lemmatize
  df['lemma'] = df['tokens'].apply(mbti_lemma, args=(lemmatizer, ))

  return df

In [None]:
# Clean the 'message' column and add the tokens/stem/lemma columns.
# wrangle() assigns into df itself, so df is updated without reassignment.
wrangle(df,'message')

In [None]:
# Inspect the row at position 9 (the tenth row) after cleaning
df.iloc[9]

#### Let's view the first ten messages again after cleaning

In [None]:
# Print the first 10 messages again, now that they have been cleaned.
# head(10) selects rows by position, so the very first message is included
# (range(1, 11) with label indexing skipped row 0 and printed row 10 instead).
for i, message in enumerate(df.message.head(10), start=1):
    print('message number ',i)
    print('===================================================================================')
    print(message)
    print(" ")


## Exploratory Data Analysis