<a href="https://colab.research.google.com/github/abhilashhn1993/Sentiment_Analysis_of_Tweets/blob/master/Tweets_PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install spacy
!pip install pyLDAvis

In [0]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import logging
# Emit only ERROR-level (and above) log records, prefixed with time and level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
# Silence DeprecationWarning messages.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
# Download the NLTK 'stopwords' corpus that the stop-word list is built from.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
from nltk.corpus import stopwords
# Start from NLTK's English stop-word list; the following cells remove and add entries.
stop_words = stopwords.words('english')

In [0]:
# Words that must stay in the tweets: take them out of the stop-word list.
# Filtering (instead of list.remove) keeps this cell safe to re-run and tolerant
# of words that are missing from the installed NLTK list, where remove() would
# raise ValueError.
words_to_keep = {"don't", "been", "being", "haven't", "couldn't", "didn't"}
stop_words = [word for word in stop_words if word not in words_to_keep]

In [0]:
# Extra stop words to be removed from the dataset.
extra_stop_words = [
    # URL fragments and platform names
    'http', 'https', 'twitter', 'www', 'instagram', 'zsgdbw', 'tmblr', 'co',
    'twitch', 'facebook', 'snapchat',
    # frequent filler words
    'make', 'really', 'see', 'go', 'would', 'even', 'get', 'com', 'be', 'year',
    'still', 'do', 'know', 'actually', 'much', 'let', 's', 'have', 'name', 'photo',
    'back', 'today', 'day', 'last', 'thing', 'kit', 'stream', 'amp',
    # some custom keywords frequently seen in the dataset
    'trinawolfy', 'riyenrootsmusic',
]

# Append only the words that are not in the list yet, so re-running this cell
# (or listing a word twice) does not grow stop_words with duplicates.
new_stop_words = [word for word in dict.fromkeys(extra_stop_words) if word not in stop_words]
stop_words.extend(new_stop_words)
len(stop_words)

**Initialize all the custom methods to pre-process the text data**

In [0]:
import string
#METHODS FOR TEXT CLEANING#

#text punctuation removal
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table with no replacements and a delete-set of punctuation
    # strips all of the characters in a single pass over the string.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

#method for pre-processing
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``; the resulting token list is
    yielded, one per input item.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

# Method for stopwords removal
def remove_stopwords(texts):
    """Tokenize each document and drop the tokens found in the global ``stop_words``.

    Each item of ``texts`` is converted with ``str()`` and re-tokenized by
    gensim's ``simple_preprocess`` before filtering, so items may be plain
    strings or token lists.

    Args:
        texts: iterable of documents.

    Returns:
        A list with one list of surviving tokens per document.
    """
    # Snapshot the stop words as a set once per call: each membership test is
    # then a hash lookup instead of a scan of the whole list for every token.
    blocked = frozenset(stop_words)
    return [
        [token for token in simple_preprocess(str(doc)) if token not in blocked]
        for doc in texts
    ]

#Method for text Lemmatization with POS tagging
# spaCy 3 removed shortcut links such as 'en', so load the English model by its
# package name; fall back to the legacy shortcut for spaCy 2.x environments that
# only expose the model through the link. The parser and NER components are
# disabled because only lemmas and POS tags are used.
try:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en', disable=['parser', 'ner'])
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Args:
        texts: iterable of token lists; each list is joined with spaces and run
            through the module-level spaCy pipeline ``nlp``.
        allowed_postags: collection of coarse POS tags (``token.pos_`` values)
            to keep. The default is an immutable tuple so it cannot be altered
            between calls; lists passed by callers work as before.

    Returns:
        A list with one list of lemmas per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

#Method to remove every piece of text that matches a regex pattern
def remove_pattern(input_txt, pattern):
    """Delete from ``input_txt`` every substring matched by ``pattern``.

    The matches are collected with ``re.findall`` and each matched text is then
    removed wherever it occurs in the string.

    Args:
        input_txt: the text to clean.
        pattern: regular expression describing the text to remove.

    Returns:
        The text with all matched substrings removed.
    """
    for matched_text in re.findall(pattern, input_txt):
        # Remove the match as literal text. Feeding it back into re.sub would
        # interpret characters such as '+', '(' or '$' inside the match as regex
        # syntax, which either removes the wrong text or raises re.error.
        input_txt = input_txt.replace(matched_text, '')
    return input_txt

#Method to convert the cleaned tweets back into string
def convert_to_string(df):
  """Join each row's ``Tweets`` elements into one space-separated string.

  Walks the rows by position, converts every element of the row's ``Tweets``
  value with ``str()``, joins them with single spaces and assigns the result
  through ``df.iloc[row].Tweets``. Returns the ``df`` object that was passed in.
  """
  for row in range(len(df)):
    # NOTE(review): df.iloc[row] can hand back a copy of the row, in which case
    # this assignment never reaches df. The notebook re-joins df['Tweets'] with
    # .str.join(" ") after calling cleanTweets, which suggests that is what
    # happens here - confirm before relying on this function's result.
    df.iloc[row].Tweets = ' '.join([str(element) for element in df.iloc[row].Tweets])
  return df

**Clean Tweets** method to pre-process the data. 

This method **removes punctuation, emails, URLs, and stop words, and lemmatizes the tweets** in the dataset.

In [0]:
def cleanTweets(df):
    """Run the text-cleaning pipeline over ``df['Tweets']`` and return the frame.

    Steps, in order: drop tokens containing ``@``, collapse whitespace, remove
    ASCII punctuation, tokenize, remove stop words, lemmatize (nouns,
    adjectives, verbs and adverbs only), remove stop words again, and pass the
    frame to ``convert_to_string``.

    Args:
        df: DataFrame with a ``Tweets`` column of strings. The column is
            overwritten in place on the frame that is passed in.

    Returns:
        The frame returned by ``convert_to_string`` with the row whose index
        label is ``0`` dropped.

    Raises:
        KeyError: if the frame has no row labelled ``0``.
    """
    # Convert to list
    df['Tweets'] = df.Tweets.values.tolist()

    # Remove every whitespace-delimited token containing '@' (e-mail addresses, @mentions)
    df['Tweets'] = [re.sub(r'\S*@\S*\s?', '', sent) for sent in df['Tweets']]
    # Collapse runs of whitespace, new lines included, into a single space
    df['Tweets'] = [re.sub(r'\s+', ' ', sent) for sent in df['Tweets']]
    # Remove ASCII punctuation
    df['Tweets'] = df.Tweets.apply(remove_punctuation)
    # Remove single quotes (ASCII quotes are already gone after the punctuation pass)
    df['Tweets'] = [re.sub("'", "", sent) for sent in df['Tweets']]
    # Remove @handles; '@' was already stripped above, so this is only a safety net
    df['Tweets'] = np.vectorize(remove_pattern)(df['Tweets'], r"@[\w]*")

    # Tokenize, then drop stop words
    df['Tweets'] = list(sent_to_words(df['Tweets']))
    df['Tweets'] = remove_stopwords(df['Tweets'])

    # Lemmatize with the module-level spaCy pipeline
    df['Tweets'] = lemmatization(df['Tweets'], allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Remove the stop words again after lemmatizing the text
    df['Tweets'] = remove_stopwords(df['Tweets'])

    df = convert_to_string(df)
    # NOTE(review): this drops the row labelled 0. On a default integer index
    # that is the first data row - confirm that discarding it is intended.
    df = df.drop([0], axis=0)
    return df

**Read the dataset**

In [9]:
from google.colab import files
# Prompt for local files to upload; `uploaded` is indexed by file name when the CSVs are read.
uploaded = files.upload()

Saving All_User_Tweets.csv to All_User_Tweets.csv


In [0]:
import io


def _read_uploaded_csv(filename):
    """Load ``filename`` as a DataFrame from the latest upload, or from disk.

    ``uploaded`` only contains the files selected in the most recent
    ``files.upload()`` call, so indexing it directly raises KeyError for files
    uploaded in an earlier call. The upload output reports saving each file
    under its own name, so those files are read from the working directory.
    """
    if filename in uploaded:
        return pd.read_csv(io.BytesIO(uploaded[filename]))
    return pd.read_csv(filename)


df_post = _read_uploaded_csv('tweets_PostDiagnosis.csv')
df_pre = _read_uploaded_csv('tweets_PreDiagnosis.csv')
df_all = _read_uploaded_csv('All_User_Tweets.csv')

In [28]:
# Give the tweet-count column an identifier-friendly name, then preview the frame.
df_all = df_all.rename({'No. of Tweets': 'TweetCount'}, axis='columns')
df_all.head()

Unnamed: 0,Username,Tweets,Day,Month,Date,Time,Year,Ptsd,Gender,Following,Followers,TweetCount
0,CropxDust,Ask teeps and John legend how many times i’ve ...,Mon,Jun,11.0,5:01:15 PM,2018.0,1,M,432,660,12200
1,CropxDust,"International house of better ingredients, bet...",Mon,Jun,11.0,1:26:14 PM,2018.0,1,M,432,660,12200
2,CropxDust,Ronda Rousey shoulda been in Skyrim,Sun,Jun,10.0,2:46:04 AM,2018.0,1,M,432,660,12200
3,CropxDust,I challenge you to come punch me in the face y...,Fri,Jun,8.0,7:07:52 PM,2018.0,1,M,432,660,12200
4,CropxDust,Come over and make catfish brother,Fri,Jun,8.0,2:18:26 AM,2018.0,1,M,432,660,12200


In [0]:
# Build one row per user for the All PTSD dataset: tweets that share the same
# username, Ptsd flag and profile fields are concatenated with single spaces.
df = (
    df_all
    .groupby(['Username', 'Ptsd', 'Gender', 'Following', 'Followers', 'TweetCount'])['Tweets']
    .agg(' '.join)
    .reset_index(name='Tweets')
)

In [0]:
# Run the cleaning pipeline on the per-user frame.
df = cleanTweets(df)

In [31]:
# convert_to_string() assigns through df.iloc[row], which can act on a copy and
# leave Tweets as token lists. Join only the values that are still token
# collections: values that are already strings pass through untouched, so
# re-running this cell cannot split them into space-separated characters.
df['Tweets'] = df['Tweets'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(str(token) for token in tokens)
)
df.head()

Unnamed: 0,Username,Ptsd,Gender,Following,Followers,TweetCount,Tweets
1,AlishaaHasan,0,F,309,264,3460,first ever happen leave cold sometimes dream t...
2,AmyRollitt,1,F,1293,459,3758,come usual shame miss opportunity least lovely...
3,BenBarbossa,1,M,2886,2858,8107,maybe maybe lovely wife call smart ass part ne...
4,BethLynch2020,1,F,2524,47500,156800,definitely better claim child guess medium deb...
5,BombayDelhiGirl,0,F,1584,2746,8218,corbyn sell idea glorify mythical past never e...


Clean the pre- and post-diagnosis tweet datasets

In [0]:
def _tokens_to_text(tokens):
    """Return ``tokens`` as one space-separated string; strings pass through unchanged."""
    if isinstance(tokens, str):
        return tokens
    return ' '.join(str(token) for token in tokens)


# cleanTweets() relies on convert_to_string(), whose row-wise assignment can act
# on a copy and leave Tweets as token lists (the all-tweets frame is re-joined
# explicitly for the same reason). Normalise both frames to plain strings so
# they are exported in the same form.
df_pre = cleanTweets(df_pre)
df_pre['Tweets'] = df_pre['Tweets'].apply(_tokens_to_text)
df_post = cleanTweets(df_post)
df_post['Tweets'] = df_post['Tweets'].apply(_tokens_to_text)

In [0]:
# Preview the first rows of the cleaned pre-diagnosis frame.
df_pre.head()

In [0]:
# Preview the first rows of the cleaned post-diagnosis frame.
df_post.head()

In [0]:
# Write each cleaned frame to its CSV file (the row index is written too, as by default).
for cleaned_frame, csv_name in (
    (df_pre, 'cleanedPreTweets.csv'),
    (df_post, 'cleanedPostTweets.csv'),
    (df, 'cleanedAllPTSDTweets.csv'),
):
    cleaned_frame.to_csv(csv_name)

In [0]:
# Send the exported CSV files to the browser for download, one after another.
for csv_name in ('cleanedPreTweets.csv', 'cleanedAllPTSDTweets.csv', 'cleanedPostTweets.csv'):
    files.download(csv_name)