In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import fnmatch
import string
from urllib.parse import urlparse
import contractions

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')
# removing_stopwords() reads nltk.corpus.stopwords; without this corpus it raises LookupError on a fresh environment
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import words, stopwords
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer

from wordsegment import load, segment
from autocorrect  import Speller

#from deep_translator import GoogleTranslator

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/arjunkhanchandani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/arjunkhanchandani/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/arjunkhanchandani/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Load the raw tweets CSV, report its (rows, columns) shape and preview the first rows.
tweets_df = pd.read_csv('/Users/arjunkhanchandani/Desktop/twitter_data_analysis-main/v2/data/tweets_v2.csv')
print(tweets_df.shape)
tweets_df.head()

(26015, 4)


Unnamed: 0.1,Unnamed: 0,date_created,tweet,city
0,0,2022-12-12 16:13:45+00:00,@esichq @byadavbjp @Rameswar_Teli @mygovindia ...,Mumbai
1,1,2022-12-10 06:30:56+00:00,@PotholeWarriors @CMOMaharashtra @mieknathshin...,Mumbai
2,2,2022-11-23 13:09:18+00:00,@Iam_Ayushmann Govandi is one of the Hotspot o...,Mumbai
3,3,2022-10-27 15:58:11+00:00,Till when medical negligence will exist in gov...,Mumbai
4,4,2022-07-28 03:03:15+00:00,Me being a doctor reading this\nAlso governmen...,Mumbai


In [3]:
# Rename the 'Unnamed: 0' column to 'tweet_id'; the cleaning helpers below use it as the row key.
tweets_df.rename(columns = {'Unnamed: 0':'tweet_id'}, inplace=True)
print(tweets_df.dtypes)

tweet_id         int64
date_created    object
tweet           object
city            object
dtype: object


In [4]:
# Keep only the calendar year of each tweet: parse the timestamp column,
# store its year in a new 'year' column, then discard the original column.
tweets_df['date_created'] = pd.to_datetime(tweets_df['date_created'])
tweets_df['year'] = tweets_df['date_created'].dt.year
tweets_df.drop(columns=['date_created'], inplace=True)
tweets_df.head()

Unnamed: 0,tweet_id,tweet,city,year
0,0,@esichq @byadavbjp @Rameswar_Teli @mygovindia ...,Mumbai,2022
1,1,@PotholeWarriors @CMOMaharashtra @mieknathshin...,Mumbai,2022
2,2,@Iam_Ayushmann Govandi is one of the Hotspot o...,Mumbai,2022
3,3,Till when medical negligence will exist in gov...,Mumbai,2022
4,4,Me being a doctor reading this\nAlso governmen...,Mumbai,2022


In [5]:
print(tweets_df.city.value_counts())

Delhi        10005
Mumbai        6715
Hyderabad     3597
Bangalore     3229
Kolkata       1413
Chennai       1056
Name: city, dtype: int64


In [6]:
print(tweets_df.year.value_counts())

2020    8963
2021    7255
2019    3885
2022    3214
2018    2698
Name: year, dtype: int64


In [7]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26015 entries, 0 to 26014
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  26015 non-null  int64 
 1   tweet     26015 non-null  object
 2   city      26015 non-null  object
 3   year      26015 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 813.1+ KB


In [42]:
# Remove rows whose tweet text is an exact repeat, reporting the shape before and after.
print('Shape of dataset before removal of duplicate tweets is {}'.format(tweets_df.shape))
# .copy() makes the de-duplicated frame independent of tweets_df, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
tweets_no_dupl_df = tweets_df.drop_duplicates(subset=['tweet']).copy()
print('Shape of dataset after removal of duplicate tweets is {}'.format(tweets_no_dupl_df.shape))

Shape of dataset before removal of duplicate tweets is (26015, 4)
Shape of dataset after removal of duplicate tweets is (19350, 4)


In [43]:
print(tweets_no_dupl_df.city.value_counts())

Delhi        7379
Mumbai       4963
Hyderabad    2614
Bangalore    2471
Kolkata      1077
Chennai       846
Name: city, dtype: int64


In [44]:
print(tweets_no_dupl_df.year.value_counts())

2020    6517
2021    5515
2019    2767
2022    2454
2018    2097
Name: year, dtype: int64


In [45]:
# Build the abbreviation -> meaning lookup from the abbreviations CSV.
abbs_df = pd.read_csv('/Users/arjunkhanchandani/Desktop/twitter_data_analysis-main/v2/data/text_abbreviations_v2.csv')
# abbs_df.drop('Unnamed: 0', axis=1, inplace=True)
abbs_df['text'] = abbs_df['text'].str.lower()  # keys are stored lower-cased

# Pair each abbreviation with its meaning; a later duplicate key overrides an earlier one.
abbs_dict = dict(zip(abbs_df['text'], abbs_df['meaning']))

In [46]:
tweets_no_dupl_df

Unnamed: 0,tweet_id,tweet,city,year
0,0,@esichq @byadavbjp @Rameswar_Teli @mygovindia ...,Mumbai,2022
1,1,@PotholeWarriors @CMOMaharashtra @mieknathshin...,Mumbai,2022
2,2,@Iam_Ayushmann Govandi is one of the Hotspot o...,Mumbai,2022
3,3,Till when medical negligence will exist in gov...,Mumbai,2022
4,4,Me being a doctor reading this\nAlso governmen...,Mumbai,2022
...,...,...,...,...
25845,25845,@KTRTRS Sir please take action on supraja hosp...,Hyderabad,2021
25859,25859,* Why meme police didn't waited for his PM rep...,Hyderabad,2020
25918,25918,@NewBolarum @amksocialwork @TOIHyderabad @Tela...,Hyderabad,2020
25962,25962,@narendramodi @PMOIndia \nHonorable PM we are ...,Hyderabad,2021


# Cleaning Tweets


In [47]:
# Extract hashtags into a new 'hashtags' column (one list per tweet).
# The pattern needs at least one ASCII letter after '#', so purely numeric tags are skipped.
tweets_no_dupl_df['hashtags'] = tweets_no_dupl_df['tweet'].map(
    lambda text: re.findall(r'\B#\w*[a-zA-Z]+\w*', text)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_no_dupl_df['hashtags'] = tweets_no_dupl_df['tweet'].apply(lambda x: re.findall(r'\B#\w*[a-zA-Z]+\w*', x)) #creating a new column


In [48]:
tweets_no_dupl_df

Unnamed: 0,tweet_id,tweet,city,year,hashtags
0,0,@esichq @byadavbjp @Rameswar_Teli @mygovindia ...,Mumbai,2022,[]
1,1,@PotholeWarriors @CMOMaharashtra @mieknathshin...,Mumbai,2022,[]
2,2,@Iam_Ayushmann Govandi is one of the Hotspot o...,Mumbai,2022,[#Measles]
3,3,Till when medical negligence will exist in gov...,Mumbai,2022,[]
4,4,Me being a doctor reading this\nAlso governmen...,Mumbai,2022,[]
...,...,...,...,...,...
25845,25845,@KTRTRS Sir please take action on supraja hosp...,Hyderabad,2021,[]
25859,25859,* Why meme police didn't waited for his PM rep...,Hyderabad,2020,[#IndiaFightsBack4SSR]
25918,25918,@NewBolarum @amksocialwork @TOIHyderabad @Tela...,Hyderabad,2020,[]
25962,25962,@narendramodi @PMOIndia \nHonorable PM we are ...,Hyderabad,2021,[]


In [52]:
#removing mentions
def remove_mentions(tweet):
    """Strip @-mentions from ``tweet`` and normalise its whitespace.

    Every ``@`` followed by word characters is replaced by a space; the text
    is then split on whitespace and re-joined with single spaces, so leading,
    trailing and repeated whitespace disappears.
    """
    without_handles = re.sub(r"@\w+", " ", tweet)
    words = without_handles.split()
    return ' '.join(words)

# Run remove_mentions over every tweet; the function is passed directly instead of through a lambda.
tweets_no_dupl_df['tweet'] = tweets_no_dupl_df['tweet'].apply(remove_mentions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_no_dupl_df['tweet'] = tweets_no_dupl_df['tweet'].apply(lambda x: remove_mentions_hashtags(x))


In [55]:
tweets_no_dupl_df['tweet']

0        It is very bad thing to say that government di...
1        all netas and their families should be admitte...
2        Govandi is one of the Hotspot of #Measles as w...
3        Till when medical negligence will exist in gov...
4        Me being a doctor reading this Also government...
                               ...                        
25845    Sir please take action on supraja hospital nag...
25859    * Why meme police didn't waited for his PM rep...
25918    Such a good facility with 30 bedded inpatient ...
25962    Honorable PM we are aware Private Hospitals ar...
25975    Plz look into the deaths of infants &amp; new ...
Name: tweet, Length: 19350, dtype: object

In [17]:
#segmenting words
def segment_words(tweet):
    """Segment ``tweet`` into words with ``wordsegment`` and join them with spaces.

    Args:
        tweet: Text to segment.

    Returns:
        The segments produced by ``segment(tweet)`` joined by single spaces.
    """
    # load() populates wordsegment's corpus data. Calling it for every tweet repeats
    # that work for each row, so remember on the function object that it already ran.
    if not getattr(segment_words, '_corpus_loaded', False):
        load()
        segment_words._corpus_loaded = True
    return ' '.join(segment(tweet))

In [18]:
# Segment every tweet; the function is passed directly instead of through a lambda.
tweets_no_dupl_df['tweet'] = tweets_no_dupl_df['tweet'].apply(segment_words)

: 

: 

In [None]:
# def autocorrect(tweet):
#     spell = Speller(lang='en')
#     tweet = ' '.join([spell(w) for w in tweet.split()])
#     return tweet

In [None]:
#tweets_no_dupl_df['tweet'] = tweets_no_dupl_df['tweet'].apply(lambda x: autocorrect(x))

In [None]:
#removing links
def removing_links(df):
    """Drop URL tokens from every tweet in ``df['tweet']``.

    Each tweet is tokenised with ``TweetTokenizer``; tokens for which
    ``urlparse(token).scheme`` is non-empty are discarded and the remaining
    tokens are re-joined with single spaces.

    The column is rewritten row by row in a single pass. The previous version
    looked up ``tweet_id`` by tweet text and wrote back by ``tweet_id`` for
    every row, which scanned the whole frame twice per tweet and could update
    the wrong row when two rows held the same text.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    tokenizer = TweetTokenizer()  # one tokenizer serves the whole column

    def _strip_links(text):
        kept = [token for token in tokenizer.tokenize(text) if not urlparse(token).scheme]
        return ' '.join(kept)

    df['tweet'] = df['tweet'].apply(_strip_links)
    return df

# Strip URL tokens from the tweet column; removing_links returns the frame it was given.
tweets_no_dupl_df = removing_links(tweets_no_dupl_df)

In [None]:
# Replace http(s) URLs with a space.
# regex=True is explicit because Series.str.replace treats the pattern as a literal
# string by default in pandas >= 2.0, which would silently leave every URL in place.
# The pattern is a raw string so that '\(' is not an invalid string escape.
tweets_no_dupl_df['tweet'] = tweets_no_dupl_df['tweet'].str.replace(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    ' ',
    regex=True,
)

In [None]:
tweets_no_dupl_df

In [None]:
# Remove leftover @mentions and http(s)/www links, then collapse whitespace.
# re.sub() accepts a single string, so passing the whole column raised TypeError, and the
# follow-up assignment would have written one joined string into every row. The .str
# accessor applies the same pattern and whitespace normalisation to each tweet separately.
tweets_no_dupl_df['tweet'] = (
    tweets_no_dupl_df['tweet']
    .str.replace(r"(?:\@|http?\://|https?\://|www)\S+", "", regex=True)
    .str.split()
    .str.join(' ')
)

In [None]:
tweets_no_dupl_df

In [None]:
def replacing_abbr(df, dictry):
    """Expand abbreviations in ``df['tweet']`` using the ``dictry`` lookup.

    Tweets are split on whitespace and each word is looked up in ``dictry``:
    first exactly as written, then lower-cased. The lower-case fallback is
    needed because the notebook lower-cases the dictionary keys while tweets
    still carry their original casing at this stage. A word is left unchanged
    when it has no entry or when its entry is not a string (for example a
    missing value read from the CSV).

    The column is rewritten in one pass instead of looking up ``tweet_id`` by
    tweet text for every row.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.
        dictry: Mapping of abbreviation -> expansion.

    Returns:
        The same ``df`` object.
    """
    def _expand_word(word):
        meaning = dictry.get(word, dictry.get(word.lower(), word))
        return meaning if isinstance(meaning, str) else word

    def _expand(text):
        return ' '.join(_expand_word(word) for word in text.split())

    df['tweet'] = df['tweet'].apply(_expand)
    return df

def contractions_handling(df):
    """Run ``contractions.fix`` over every tweet in ``df['tweet']``.

    The column is rewritten in one pass; the previous per-row lookup of
    ``tweet_id`` by tweet text scanned the whole frame for every tweet.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    df['tweet'] = df['tweet'].apply(contractions.fix)
    return df

def adding_space_bw_words_punc(df):
    """Surround punctuation marks in ``df['tweet']`` with spaces.

    The marks ``, . ? ! - ( ) : ; " '`` are each padded with a space on both
    sides, then runs of spaces are collapsed to a single space. The previous
    single ``replace('  ', ' ')`` pass left double spaces wherever three or
    more spaces were produced.

    The column is rewritten in one pass, without the per-row ``tweet_id``
    lookup and the unused tokenizer.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    # Maps each punctuation character to itself wrapped in spaces.
    padding = str.maketrans({mark: ' {} '.format(mark) for mark in ',.?!-():;"\''})

    def _pad(text):
        return re.sub(' {2,}', ' ', text.translate(padding))

    df['tweet'] = df['tweet'].apply(_pad)
    return df

def removing_hashtags_mentions(df):
    """Drop tokens that start with ``#``, ``@`` or ``▪`` from every tweet.

    Each tweet is tokenised with ``TweetTokenizer``, tokens beginning with one
    of the three prefixes are removed, and the rest are re-joined with single
    spaces.

    One tokenizer is built for the whole column, the two filter passes are
    merged, and the column is rewritten in one pass instead of looking up
    ``tweet_id`` by tweet text for every row.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    tokenizer = TweetTokenizer()
    # A real tuple: the original ('▪') was a plain string that only worked by accident.
    unwanted_prefixes = ('#', '@', '▪')

    def _strip(text):
        kept = [token for token in tokenizer.tokenize(text)
                if not token.startswith(unwanted_prefixes)]
        return ' '.join(kept)

    df['tweet'] = df['tweet'].apply(_strip)
    return df
          
def removing_punctuations_emojis(df):
    """Strip punctuation and non-ASCII characters from tweets and lower-case them.

    For every tweet: characters in ``string.punctuation`` are deleted, each run
    of non-ASCII characters (emojis, but also any other non-ASCII text) is
    replaced by one space, and the result is lower-cased.

    The translation table is built once and the column is rewritten in one
    pass, without the per-row ``tweet_id`` lookup and the unused tokenizer.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(text):
        text = text.translate(drop_punctuation)
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        return text.lower()

    df['tweet'] = df['tweet'].apply(_clean)
    return df

def removing_numbers(df):
    """Delete digit runs from every tweet and collapse repeated spaces.

    Uses the vectorised ``.str.replace`` accessor so the column is rewritten
    in one pass, without the per-row ``tweet_id`` lookup and the unused
    tokenizer.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    without_digits = df['tweet'].str.replace(r'\d+', '', regex=True)
    df['tweet'] = without_digits.str.replace(' +', ' ', regex=True)
    return df

def removing_stopwords(df):
    """Remove English stop words from every tweet in ``df['tweet']``.

    Each tweet is tokenised with ``TweetTokenizer``; tokens found in
    ``stopwords.words('english')`` are dropped and the rest are re-joined with
    single spaces. Matching is case-sensitive, as before.

    The stop-word list is fetched once and kept in a set. The previous version
    called ``stopwords.words('english')`` again for every token and searched
    the resulting list linearly. The per-row ``tweet_id`` lookup is gone too.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    tokenizer = TweetTokenizer()
    stop_set = set(stopwords.words('english'))

    def _filter(text):
        return ' '.join(token for token in tokenizer.tokenize(text) if token not in stop_set)

    df['tweet'] = df['tweet'].apply(_filter)
    return df

def lemmatization(df):
    """Lemmatize every token of every tweet with ``WordNetLemmatizer``.

    Each tweet is tokenised with ``TweetTokenizer``, every token is passed
    through ``lemmatize`` and the results are re-joined with single spaces.

    The tokenizer and lemmatizer are created once for the whole column rather
    than once per row, and the column is rewritten in one pass instead of
    looking up ``tweet_id`` by tweet text for every row.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    tokenizer = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()

    def _lemmatize(text):
        return ' '.join(lemmatizer.lemmatize(token) for token in tokenizer.tokenize(text))

    df['tweet'] = df['tweet'].apply(_lemmatize)
    return df

def removing_characters_less_3(df):
    """Drop tokens shorter than three characters from every tweet.

    Each tweet is tokenised with ``TweetTokenizer``; tokens with
    ``len(token) > 2`` are kept and re-joined with single spaces.

    One tokenizer is built for the whole column and the column is rewritten in
    one pass instead of looking up ``tweet_id`` by tweet text for every row.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    tokenizer = TweetTokenizer()

    def _keep_long_tokens(text):
        return ' '.join(token for token in tokenizer.tokenize(text) if len(token) > 2)

    df['tweet'] = df['tweet'].apply(_keep_long_tokens)
    return df

def removing_tweets_less_5(df):
    """Drop tweets that contain fewer than five whitespace-separated words.

    Word counts are computed for the whole column at once and all short rows
    are removed with a single in-place ``drop``. The previous version looked
    up each row's ``tweet_id`` by tweet text and dropped rows one at a time.

    Args:
        df: DataFrame with a string ``tweet`` column. Modified in place; the
            surviving rows keep their original index labels.

    Returns:
        The same ``df`` object.
    """
    word_counts = df['tweet'].str.split().str.len()
    too_short = (word_counts < 5).to_numpy()
    df.drop(index=df.index[too_short], inplace=True)
    return df

def removing_duplicates(df):
    """Remove exact duplicate tweets and consecutive near-duplicates.

    First, rows whose ``tweet`` text repeats an earlier row are dropped. Then,
    among the remaining rows, a row is dropped when its first five words equal
    the first five words of the row directly before it, so only the first row
    of each such run survives.

    The previous loop adjusted ``i`` by hand inside a ``for`` loop (which has
    no effect), ran off the end of the shrinking frame and hid the resulting
    IndexError behind a bare ``except``. As a result a run of three or more
    near-duplicates was only partly removed.

    Args:
        df: DataFrame with a string ``tweet`` column. Not modified.

    Returns:
        A new DataFrame holding the kept rows, with their original index labels.
    """
    df = df.drop_duplicates(subset=['tweet'])

    # Words contain no whitespace, so joining the first five with a space gives a
    # string key that is equal exactly when the five-word lists are equal.
    opening = df['tweet'].apply(lambda text: ' '.join(text.split()[:5]))
    # A row starts a new run when its opening differs from the previous row's.
    starts_new_run = (opening != opening.shift()).to_numpy()

    return df.loc[starts_new_run].copy()


def data_preprocessing(df, dictry):
    """Run the full tweet-cleaning pipeline and return the cleaned frame.

    The steps run in a fixed order: link removal, abbreviation expansion,
    contraction handling, punctuation spacing, hashtag/mention removal,
    punctuation and non-ASCII removal with lower-casing, number removal,
    stop-word removal, lemmatization, short-token removal, short-tweet removal
    and duplicate removal. The index is then reset to 0..n-1.

    Several steps edit the frame they receive in place, so the pipeline works
    on a copy. The caller's frame is left untouched and the cell can be re-run
    on the same input with the same result.

    Args:
        df: DataFrame with ``tweet`` and ``tweet_id`` columns.
        dictry: Abbreviation -> expansion mapping passed to ``replacing_abbr``.

    Returns:
        A new, cleaned DataFrame with a fresh default index.
    """
    df = df.copy()

    df = removing_links(df)
    df = replacing_abbr(df, dictry)
    df = contractions_handling(df)
    df = adding_space_bw_words_punc(df)
    df = removing_hashtags_mentions(df)
    df = removing_punctuations_emojis(df)
    df = removing_numbers(df)
    df = removing_stopwords(df)
    df = lemmatization(df)
    df = removing_characters_less_3(df)
    df = removing_tweets_less_5(df)
    df = removing_duplicates(df)

    return df.reset_index(drop=True)

In [None]:
# Run the cleaning pipeline with the abbreviation lookup, then show the result's shape and first rows.
tweets_cleaned_df = data_preprocessing(tweets_no_dupl_df, abbs_dict)
print(tweets_cleaned_df.shape)
tweets_cleaned_df.head()

In [None]:
# Write the cleaned tweets to CSV, leaving out the index column.
tweets_cleaned_df.to_csv(
    '/Users/arjunkhanchandani/Desktop/twitter_data_analysis-main/v2/data/tweets_cleaned_v2.csv',
    index=False,
)

# Handling Hinglish


In [None]:
# def hinglish_to_english(df, lang):
#     translated_val = list()
#     count = 0
#     # translating the text to english using GoogleTranslator API
#     for x in df['tweet']:
#         count += 1
#         try:
#             # GoogleTranslator API has a limit of 5000 characters per request, so splitting the text into chunks of 5000 characters and then translating it
#             if len(str(x))<5000:
                
#                 translation = GoogleTranslator(source=lang, target='en').translate(x)
#                 translated_val.append(translation)
            
#             elif len(str(x))>5000 and len(str(x))<10000:
                
#                 split_x = [x[i:i+4999] for i in range(0, len(x), 4999)]
#                 translation_1 = GoogleTranslator(source=lang, target='en').translate(split_x[0])
#                 translation_2 = GoogleTranslator(source=lang, target='en').translate(split_x[1])
#                 translation = translation_1 + translation_2
#                 translated_val.append(translation)
                
#         except Exception as e:
#             # if the text is not in the language provided, then it will return a nan value or text length is more than 15000 characters
#             translated_val.append(np.nan)
#         if count%1000==0:
#             print(count)
    
#     # replacing the original text with the translated text
#     df['tweet_translated'] = translated_val
    
#     # returning the updated dataframe
#     return df
            

In [None]:
# translated_df = hinglish_to_english(tweets_cleaned_df, 'hi')
# print(translated_df.isna().sum())
# translated_df.dropna(inplace=True)
# print(translated_df.isna().sum())
# translated_df.head()

In [None]:
# pd.DataFrame.to_csv(translated_df, '/Users/nitanshjain/Documents/Projects/Twitter_Data_Analysis/v2/data/tweets_cleaned_translated_v2.csv', index=False)