In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import fnmatch
import string
from urllib.parse import urlparse
import contractions

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import words, stopwords
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer

from deep_translator import GoogleTranslator

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nitanshjain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nitanshjain/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/nitanshjain/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
tweets_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/Twitter_Data_Analysis/v2/data/tweets_v2.csv')
print(tweets_df.shape)
tweets_df.head()

(26015, 4)


Unnamed: 0.1,Unnamed: 0,date_created,tweet,city
0,0,2022-12-12 16:13:45+00:00,@esichq @byadavbjp @Rameswar_Teli @mygovindia ...,Mumbai
1,1,2022-12-10 06:30:56+00:00,@PotholeWarriors @CMOMaharashtra @mieknathshin...,Mumbai
2,2,2022-11-23 13:09:18+00:00,@Iam_Ayushmann Govandi is one of the Hotspot o...,Mumbai
3,3,2022-10-27 15:58:11+00:00,Till when medical negligence will exist in gov...,Mumbai
4,4,2022-07-28 03:03:15+00:00,Me being a doctor reading this\nAlso governmen...,Mumbai


In [3]:
tweets_df.rename(columns = {'Unnamed: 0':'tweet_id'}, inplace=True)
print(tweets_df.dtypes)

tweet_id         int64
date_created    object
tweet           object
city            object
dtype: object


In [4]:
#selecting only year from date_created column
tweets_df['date_created'] = pd.to_datetime(tweets_df['date_created'])
tweets_df['year'] = tweets_df['date_created'].dt.year
tweets_df.drop(['date_created'], axis=1, inplace=True)
tweets_df.head()

Unnamed: 0,tweet_id,tweet,city,year
0,0,@esichq @byadavbjp @Rameswar_Teli @mygovindia ...,Mumbai,2022
1,1,@PotholeWarriors @CMOMaharashtra @mieknathshin...,Mumbai,2022
2,2,@Iam_Ayushmann Govandi is one of the Hotspot o...,Mumbai,2022
3,3,Till when medical negligence will exist in gov...,Mumbai,2022
4,4,Me being a doctor reading this\nAlso governmen...,Mumbai,2022


In [5]:
print(tweets_df.city.value_counts())
print(tweets_df.year.value_counts())

Delhi        10005
Mumbai        6715
Hyderabad     3597
Bangalore     3229
Kolkata       1413
Chennai       1056
Name: city, dtype: int64
2020    8963
2021    7255
2019    3885
2022    3214
2018    2698
Name: year, dtype: int64


In [6]:
print('Shape of dataset before removal of duplicates is {}'.format(tweets_df.shape))
tweets_no_dupl_df = tweets_df.drop_duplicates(subset=['tweet'])
print('Shape of dataset after removal of duplicates is {}'.format(tweets_no_dupl_df.shape))

Shape of dataset before removal of duplicates is (26015, 4)
Shape of dataset after removal of duplicates is (19350, 4)


In [7]:
print(tweets_no_dupl_df.city.value_counts())
print(tweets_no_dupl_df.year.value_counts())

Delhi        7379
Mumbai       4963
Hyderabad    2614
Bangalore    2471
Kolkata      1077
Chennai       846
Name: city, dtype: int64
2020    6517
2021    5515
2019    2767
2022    2454
2018    2097
Name: year, dtype: int64


In [12]:

def removing_links(df):
     for tweets in df.loc[:,'tweet']:
          
          tokenizer = TweetTokenizer()
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
        
          tokens = tokenizer.tokenize(tweets)
          x = [word for word in tokens if not urlparse(word).scheme]
          tweets = ' '.join(x)
          
          df.loc[df['tweet_id']==tweet_id, 'tweet'] = tweets
          
     return df

def contractions_handling(df):
     for tweets in df.loc[:,'tweet']:
          
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
          tweets = contractions.fix(tweets)
          
          df.loc[df['tweet_id']==tweet_id, 'tweet'] = tweets
          
     return df

def adding_space_bw_words_punc(df):
     for tweets in df.loc[:, 'tweet']:
          
          tokenizer = TweetTokenizer()
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
          tweets = tweets.replace(',', ' , ').replace('.', ' . ').replace('?', ' ? ').replace('!', ' ! ')
          
          df.loc[df['tweet_id']==tweet_id, 'tweet'] = tweets
     
     return df

def removing_hashtags_mentions(df):
     for tweets in df.loc[:, 'tweet']:
          
          tokenizer = TweetTokenizer()
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
          
          tokens = tokenizer.tokenize(tweets)
          tokens = [word for word in tokens if word[0] not in ('#', '@')]
          tokens = [word for word in tokens if word[0] not in ('▪')]
          tweets = ' '.join(tokens)
          
          df.loc[df['tweet_id']==tweet_id, 'tweet'] = tweets
     
     return df
          
def removing_punctuations_emojis(df):
     for tweets in df.loc[:, 'tweet']:
          
          tokenizer = TweetTokenizer()
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
          
          tweets = tweets.translate(str.maketrans('', '', string.punctuation)) # removes punctuations
          tweets = re.sub(r'[^\x00-\x7F]+', ' ', tweets) # removes emojis
          tweets = tweets.lower() # converts text to lower case
          
          df.loc[df['tweet_id']==tweet_id, 'tweet'] = tweets
     
     return df

def removing_numbers(df):
     for tweets in df.loc[:, 'tweet']:
          
          tokenizer = TweetTokenizer()
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
          
          #remove numbers
          tweets = re.sub(r'\d+', '', tweets)
          tweets = re.sub(' +', ' ', tweets)
          
          df.loc[df['tweet_id']==tweet_id, 'tweet'] = tweets
     
     return df

def removing_stopwords(df):
     for tweets in df.loc[:,'tweet']:
          
          tokenizer = TweetTokenizer()
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
          
          tokens = tokenizer.tokenize(tweets)
          filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
          tweets = ' '.join(filtered_tokens)
          
          df.loc[df['tweet_id']==tweet_id, 'tweet'] = tweets
          
     return df

def lemmatization(df):
     for tweets in df.loc[:,'tweet']:
          
          tokenizer = TweetTokenizer()
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
          
          tokens = tokenizer.tokenize(tweets)
          lemmatizer = WordNetLemmatizer()
          lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
          tweets = ' '.join(lemmatized_tokens)
          
          df.loc[df['tweet_id']==tweet_id, 'tweet'] = tweets
          
     return df

def removing_single_characters(df):
     for tweets in df.loc[:,'tweet']:
          
          tokenizer = TweetTokenizer()
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
          
          tokens = tokenizer.tokenize(tweets)
          filtered_tokens = [word for word in tokens if len(word) > 1]
          tweets = ' '.join(filtered_tokens)
          
          df.loc[df['tweet_id']==tweet_id, 'tweet'] = tweets
          
     return df

def removing_tweets_less_5(df):
     for tweets in df.loc[:, 'tweet']:
          
          tweet_id = df.loc[df['tweet'] == tweets, 'tweet_id'].values[0]
          
          if len(tweets.split()) < 5:
               df.drop(df[df['tweet_id']==tweet_id].index, inplace=True)
     return df

def removing_duplicates(df):
     df = df.drop_duplicates(subset=['tweet'])
     return df

def data_preprocessing(df):
     
     df = removing_links(df)
     df = contractions_handling(df)
     # df = adding_space_bw_words_punc(df)
     df = removing_hashtags_mentions(df)
     # df = removing_punctuations_emojis(df)
     df = removing_numbers(df)
     df = lemmatization(df)
     df = removing_stopwords(df)
     df = removing_single_characters(df)
     df = removing_tweets_less_5(df)
     df = removing_duplicates(df)
     df.reset_index(drop=True, inplace=True)
     
     return df

In [13]:
tweets_cleaned_df = data_preprocessing(tweets_no_dupl_df)
print(tweets_cleaned_df.shape)
tweets_cleaned_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['tweet_id']==tweet_id].index, inplace=True)


(18184, 4)


Unnamed: 0,tweet_id,tweet,city,year
0,0,It bad thing say government medical spare oper...,Mumbai,2022
1,1,netas family admitted government hospital Priv...,Mumbai,2022
2,2,Govandi one Hotspot well Respiratory Diseases ...,Mumbai,2022
3,3,Till medical negligence exist government hospi...,Mumbai,2022
4,4,Me doctor reading Also government hospital res...,Mumbai,2022


In [27]:
def get_abbreviations(df):
    count = 0
    abb = list()
    for tweets in df.loc[:, 'tweet']:
        count+=1
        # finding abbreviations in the tweets
        x = re.findall(r"\b[A-Z\.]{2,}s?\b", tweets)
        if len(x) > 0:
            # print(x)
            for i in x:
                if len(i)<5:
                    abb.append(i)
    print(set(abb))

get_abbreviations(tweets_cleaned_df)


{'JMI', 'PO', 'WEA', 'NCI', 'AFMS', 'NZM', 'GHK', 'SAST', 'TY', 'AMU', 'BU', 'DAD', 'IGST', 'DMC', 'ACPD', 'RMO', 'MRC', 'BMJH', 'AT', 'CHS', 'IP', 'IN', 'SBC', 'GOK', 'KAR', 'SAW', 'GOVT', 'JPR', 'NAVY', 'EAST', 'TU', 'DFS', 'BS', 'SP', 'REG', 'CP', 'DIV', 'R.', 'MA', 'GALI', 'SAR', 'SGRH', 'HCG', 'NSUI', 'CGST', 'MY', 'MET', 'UPA', 'COP', 'WHAT', 'CMC', 'RSS', 'IS', 'PMS', 'USD', 'HBA', 'THR', 'PPH', 'CMRI', 'DIYA', 'WRT', 'UAPA', 'LPG', 'GPRA', 'IIMC', 'NOR', 'DOCS', 'GTR', 'UTs', 'CAR', 'NLUs', 'LWP', 'ADMK', 'SAHA', 'IUC', 'JAI', 'KDMC', 'AMHO', 'OT', 'CD', 'J.', 'USG', 'RAY', 'CK', 'CTP', 'GALL', 'QC', 'JNU', 'KPC', 'ESRD', 'LN', 'BDD', 'MAD', 'CAME', 'SK', 'RT', 'AGE', 'YAG', 'MRs', 'ELSE', 'SAIL', 'SAAS', 'IITM', 'DTCs', 'SION', 'KPIs', 'A.', 'AIMS', 'ZONE', 'MDL', 'THQ', 'NGEF', 'TTT', 'ORS', 'UAE', 'PPP', 'HOW', 'PAC', 'AMH', 'ET', 'CELL', 'BASE', 'BANK', 'AMID', 'RGI', 'HV', 'V.C.', 'MHRD', 'SER', 'MEN', 'B.S.', 'POOL', 'JCB', 'UPS', 'SUN', 'DA', 'EHS', 'CNG', 'PLZZ', 'TMA',

In [10]:
def hinglish_to_english(df, lang):
    translated_val = list()
    count = 0
    # translating the text to english using GoogleTranslator API
    for x in df['tweet']:
        count += 1
        try:
            # GoogleTranslator API has a limit of 5000 characters per request, so splitting the text into chunks of 5000 characters and then translating it
            if len(str(x))<5000:
                
                translation = GoogleTranslator(source=lang, target='en').translate(x)
                translated_val.append(translation)
            
            elif len(str(x))>5000 and len(str(x))<10000:
                
                split_x = [x[i:i+4999] for i in range(0, len(x), 4999)]
                translation_1 = GoogleTranslator(source=lang, target='en').translate(split_x[0])
                translation_2 = GoogleTranslator(source=lang, target='en').translate(split_x[1])
                translation = translation_1 + translation_2
                translated_val.append(translation)
                
        except Exception as e:
            # if the text is not in the language provided, then it will return a nan value or text length is more than 15000 characters
            translated_val.append(np.nan)
        if count%1000==0:
            print(count)
    
    # replacing the original text with the translated text
    df['tweet_translated'] = translated_val
    
    # returning the updated dataframe
    return df
            

In [11]:
translated_df = hinglish_to_english(tweets_cleaned_df, 'hi')
translated_df.head()

1000
2000
3000
4000
5000


In [None]:
pd.DataFrame.to_csv(translated_df, '/Users/nitanshjain/Documents/Projects/Twitter_Data_Analysis/v2/data/tweets_translated_v2.csv', index=False)

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', min_df=10, ngram_range=(1,1))
data_clean_cv = cv.fit_transform(translated_df.tweet)
# data_clean_cv.toarray()
data_clean_dtm = pd.DataFrame(data_clean_cv.toarray(), columns=cv.get_feature_names_out())
data_clean_dtm.index = translated_df.index
data_clean_dtm

In [None]:
# creating csv of cleaned dataset
pd.DataFrame.to_csv(tweets_cleaned_df, '/Users/nitanshjain/Documents/Projects/Twitter_Data_Analysis/v2/data/tweets_cleaned_with_stopwords_v2.csv', index=False)