In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import fnmatch
import string
from urllib.parse import urlparse
import contractions

import nltk
# Fetch the NLTK resources this notebook reads. nltk.download is a no-op when
# the resource is already present and up to date.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')
# stopwords.words('english') is used by removing_stopwords; without this
# corpus a fresh environment raises LookupError there.
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import words, stopwords
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer

from deep_translator import GoogleTranslator

In [None]:
# Read the raw tweets CSV, report its (rows, columns) and preview the first rows.
tweets_csv_path = '/Users/nitanshjain/Documents/Projects/Twitter_Data_Analysis/v2/data/tweets_v2.csv'
tweets_df = pd.read_csv(tweets_csv_path)
print(tweets_df.shape)
tweets_df.head()

In [None]:
# Give the 'Unnamed: 0' column the name 'tweet_id', then list the column dtypes.
tweets_df = tweets_df.rename(columns={'Unnamed: 0': 'tweet_id'})
print(tweets_df.dtypes)

In [None]:
# Keep only the calendar year of each tweet: take date_created out of the
# frame, parse it as datetimes, and store the year in a new 'year' column.
created_at = pd.to_datetime(tweets_df.pop('date_created'))
tweets_df['year'] = created_at.dt.year
tweets_df.head()

In [None]:
# Number of tweets per city and per year in the raw data.
for column in ('city', 'year'):
    print(tweets_df[column].value_counts())

In [None]:
# Drop rows whose tweet text repeats an earlier row, reporting the frame shape
# before and after.
print('Shape of dataset before removal of duplicates is {}'.format(tweets_df.shape))
# .copy() detaches the result from tweets_df. The cleaning functions later
# assign into this frame, and writing to a frame derived by row filtering can
# trigger pandas' SettingWithCopyWarning.
tweets_no_dupl_df = tweets_df.drop_duplicates(subset=['tweet']).copy()
print('Shape of dataset after removal of duplicates is {}'.format(tweets_no_dupl_df.shape))

In [None]:
# Number of tweets per city and per year after duplicate removal.
for column in ('city', 'year'):
    print(tweets_no_dupl_df[column].value_counts())

In [None]:

def removing_links(df):
    """Remove URL tokens from every tweet in ``df['tweet']``.

    Each tweet is split with NLTK's ``TweetTokenizer``. Tokens for which
    ``urlparse`` reports a scheme (for example ``https://...``) are discarded
    and the remaining tokens are re-joined with single spaces.

    Rows are transformed positionally instead of being looked up again by
    their text, so every row is processed exactly once (even when two rows
    hold the same text) and the pass is linear in the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. Missing values (NaN/None)
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with its ``tweet`` column overwritten.
    """
    # One tokenizer is enough for the whole column.
    tokenizer = TweetTokenizer()

    def _strip_links(text):
        tokens = tokenizer.tokenize(text)
        return ' '.join(tok for tok in tokens if not urlparse(tok).scheme)

    df['tweet'] = df['tweet'].map(_strip_links, na_action='ignore')
    return df

def contractions_handling(df):
    """Apply ``contractions.fix`` to every tweet in ``df['tweet']``.

    Rows are transformed positionally instead of being looked up again by
    their text, so every row is processed exactly once and the pass is linear
    in the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. Missing values (NaN/None)
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with its ``tweet`` column overwritten.
    """
    df['tweet'] = df['tweet'].map(contractions.fix, na_action='ignore')
    return df

def adding_space_bw_words_punc(df):
    """Surround every ``,`` ``.`` ``?`` and ``!`` in ``df['tweet']`` with spaces.

    Each occurrence of one of the four characters is replaced by the same
    character with one space on either side, e.g. ``'hi,there'`` becomes
    ``'hi , there'``.

    Rows are transformed positionally instead of being looked up again by
    their text. This matters here because the transformation is not
    idempotent: a text-based lookup can pad one row twice and skip another
    when a padded tweet happens to equal a later, still unprocessed tweet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. Missing values (NaN/None)
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with its ``tweet`` column overwritten.
    """
    # A single pass is equivalent to four chained str.replace calls because
    # the inserted padding never contains one of the target characters.
    target_punctuation = re.compile(r'([,.?!])')

    def _pad(text):
        return target_punctuation.sub(r' \1 ', text)

    df['tweet'] = df['tweet'].map(_pad, na_action='ignore')
    return df

def removing_hashtags_mentions(df):
    """Drop tokens that start with ``#``, ``@`` or ``▪`` from every tweet.

    Each tweet in ``df['tweet']`` is split with NLTK's ``TweetTokenizer``.
    Tokens whose first character is ``#``, ``@`` or the bullet ``▪`` are
    discarded and the remaining tokens are re-joined with single spaces.

    Rows are transformed positionally instead of being looked up again by
    their text, so every row is processed exactly once and the pass is linear
    in the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. Missing values (NaN/None)
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with its ``tweet`` column overwritten.
    """
    tokenizer = TweetTokenizer()
    # A real tuple of prefixes, checked in one pass over the tokens.
    dropped_prefixes = ('#', '@', '▪')

    def _strip_tagged_tokens(text):
        kept = [tok for tok in tokenizer.tokenize(text)
                if not tok.startswith(dropped_prefixes)]
        return ' '.join(kept)

    df['tweet'] = df['tweet'].map(_strip_tagged_tokens, na_action='ignore')
    return df
          
def removing_punctuations_emojis(df):
    """Strip punctuation and non-ASCII characters from tweets, then lowercase.

    For every tweet in ``df['tweet']``:

    1. every character listed in ``string.punctuation`` is deleted;
    2. each run of non-ASCII characters is replaced by a single space. This
       removes emojis, and also any other non-ASCII text such as accented
       letters or non-Latin scripts;
    3. the result is lowercased.

    Rows are transformed positionally instead of being looked up again by
    their text, so every row is processed exactly once and the pass is linear
    in the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. Missing values (NaN/None)
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with its ``tweet`` column overwritten.
    """
    # Build the translation table and the pattern once, not once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')

    def _clean(text):
        without_punctuation = text.translate(punctuation_table)
        ascii_only = non_ascii_run.sub(' ', without_punctuation)
        return ascii_only.lower()

    df['tweet'] = df['tweet'].map(_clean, na_action='ignore')
    return df

def removing_numbers(df):
    """Delete digit runs from every tweet and collapse repeated spaces.

    For every tweet in ``df['tweet']`` each run of digits is removed, then
    each run of one or more spaces is replaced by a single space. Leading and
    trailing spaces are not stripped.

    Rows are transformed positionally instead of being looked up again by
    their text, so every row is processed exactly once and the pass is linear
    in the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. Missing values (NaN/None)
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with its ``tweet`` column overwritten.
    """
    digit_run = re.compile(r'\d+')
    space_run = re.compile(' +')

    def _drop_numbers(text):
        # Removing digits can leave two spaces side by side; squeeze them.
        return space_run.sub(' ', digit_run.sub('', text))

    df['tweet'] = df['tweet'].map(_drop_numbers, na_action='ignore')
    return df

def removing_stopwords(df):
    """Remove NLTK English stop words from every tweet in ``df['tweet']``.

    Each tweet is split with NLTK's ``TweetTokenizer``. Tokens found in
    ``stopwords.words('english')`` are discarded and the remaining tokens are
    re-joined with single spaces. The comparison is case-sensitive.

    Rows are transformed positionally instead of being looked up again by
    their text, so every row is processed exactly once and the pass is linear
    in the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. Missing values (NaN/None)
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with its ``tweet`` column overwritten.
    """
    tokenizer = TweetTokenizer()
    # Load the stop-word list once and keep it in a set: fetching the list for
    # every token and scanning it linearly dominates the run time otherwise.
    english_stopwords = frozenset(stopwords.words('english'))

    def _drop_stopwords(text):
        kept = [tok for tok in tokenizer.tokenize(text)
                if tok not in english_stopwords]
        return ' '.join(kept)

    df['tweet'] = df['tweet'].map(_drop_stopwords, na_action='ignore')
    return df

def lemmatization(df):
    """Lemmatize every token of every tweet in ``df['tweet']``.

    Each tweet is split with NLTK's ``TweetTokenizer``. Every token is passed
    through ``WordNetLemmatizer.lemmatize`` with its default arguments and
    the results are re-joined with single spaces.

    Rows are transformed positionally instead of being looked up again by
    their text, so every row is processed exactly once and the pass is linear
    in the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. Missing values (NaN/None)
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with its ``tweet`` column overwritten.
    """
    # Build the tokenizer and the lemmatizer once for the whole column.
    tokenizer = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()

    def _lemmatize(text):
        return ' '.join(lemmatizer.lemmatize(tok) for tok in tokenizer.tokenize(text))

    df['tweet'] = df['tweet'].map(_lemmatize, na_action='ignore')
    return df

def removing_single_characters(df):
    """Drop one-character tokens from every tweet in ``df['tweet']``.

    Each tweet is split with NLTK's ``TweetTokenizer``. Only tokens longer
    than one character are kept, and they are re-joined with single spaces.

    Rows are transformed positionally instead of being looked up again by
    their text, so every row is processed exactly once and the pass is linear
    in the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. Missing values (NaN/None)
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with its ``tweet`` column overwritten.
    """
    tokenizer = TweetTokenizer()

    def _drop_single_chars(text):
        return ' '.join(tok for tok in tokenizer.tokenize(text) if len(tok) > 1)

    df['tweet'] = df['tweet'].map(_drop_single_chars, na_action='ignore')
    return df

def removing_tweets_less_5(df, min_words=5):
    """Drop, in place, every row whose tweet has fewer than ``min_words`` words.

    Words are counted with ``str.split()``, i.e. whitespace-separated chunks.
    Rows whose ``tweet`` is not a string (for example NaN) count as having no
    words and are dropped as well.

    All short rows are found in one pass and removed with a single ``drop``
    call, instead of locating and dropping each row separately.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column. Rows are removed by index label, so the
        index is expected to be unique.
    min_words : int, default 5
        Minimum number of words a tweet needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The same frame object with the short rows removed. The index is not
        reset.
    """
    long_enough = df['tweet'].map(
        lambda text: isinstance(text, str) and len(text.split()) >= min_words
    ).to_numpy(dtype=bool)

    df.drop(index=df.index[~long_enough], inplace=True)
    return df

def removing_duplicates(df):
    """Return a new frame holding only the first row for each distinct tweet text.

    The input frame is not modified and the surviving rows keep their
    original index labels.
    """
    return df.drop_duplicates(subset=['tweet'])

def data_preprocessing(df):
    """Run the full tweet-cleaning pipeline and return the cleaned frame.

    The steps run in this order, each receiving the previous step's result:
    link removal, contraction handling, spacing around punctuation,
    hashtag/mention removal, punctuation/emoji removal with lowercasing,
    number removal, lemmatization, stop-word removal, single-character token
    removal, removal of tweets with fewer than five words, and duplicate
    removal. The index is reset at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column, plus whatever other columns the
        individual steps require.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with a fresh 0..n-1 index. The frame passed in
        is left unchanged, so calling this function again on the same input
        gives the same result.
    """
    # The steps overwrite the 'tweet' column (and drop rows) on the frame they
    # are given. Working on a private copy keeps the caller's frame intact and
    # avoids writing into a frame derived from another one.
    df = df.copy()

    # Order matters: later steps rely on the text shape left by earlier ones.
    steps = (
        removing_links,
        contractions_handling,
        adding_space_bw_words_punc,
        removing_hashtags_mentions,
        removing_punctuations_emojis,
        removing_numbers,
        lemmatization,
        removing_stopwords,
        removing_single_characters,
        removing_tweets_less_5,
        removing_duplicates,
    )
    for step in steps:
        df = step(df)

    return df.reset_index(drop=True)

In [None]:
# Clean the de-duplicated tweets, then report the resulting shape and preview
# the first rows.
tweets_cleaned_df = data_preprocessing(df=tweets_no_dupl_df)
print(tweets_cleaned_df.shape)
tweets_cleaned_df.head()

# Handling Abbreviations

In [14]:
# Load the text-abbreviation lookup table and discard the 'Unnamed: 0' column.
abbreviations_csv_path = '/Users/nitanshjain/Documents/Projects/Twitter_Data_Analysis/v2/data/text_abbreviations_v2.csv'
abbs_df = pd.read_csv(abbreviations_csv_path).drop(columns=['Unnamed: 0'])
abbs_df.head()

Unnamed: 0,text,meaning
0,;S,"Gentle warning, like ""Hmm? What did you say?"""
1,?,I have a question
2,?,I don't understand what you mean
3,?4U,I have a question for you
4,.02,My (or your) two cents worth


# Handling Hinglish


In [None]:
def hinglish_to_english(df, lang, chunk_size=4999):
    """Translate every tweet in ``df['tweet']`` to English with GoogleTranslator.

    Exactly one value is produced per row, so the new column always matches
    the frame's length, whatever the length of each tweet:

    * a string of at most ``chunk_size`` characters is translated in a single
      request;
    * a longer string is cut into consecutive ``chunk_size``-character pieces
      (the cut positions ignore word boundaries), every piece is translated,
      and the translations are concatenated;
    * a non-string value (for example NaN), or a tweet whose translation
      raises, yields ``np.nan``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column.
    lang : str
        Source language passed to ``GoogleTranslator`` (e.g. ``'hi'``).
    chunk_size : int, default 4999
        Maximum number of characters sent per request. The default stays
        below the 5000-character per-request limit of the API.

    Returns
    -------
    pandas.DataFrame
        The same frame object with a new ``tweet_translated`` column.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.

    Notes
    -----
    The translator is created once, before the loop. Any error raised while
    constructing it (for example for an unsupported language code) surfaces
    immediately instead of being turned into a column of NaN.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')

    translator = GoogleTranslator(source=lang, target='en')

    def _translate_text(text):
        # Non-string cells cannot be translated.
        if not isinstance(text, str):
            return np.nan

        if len(text) <= chunk_size:
            pieces = [text]
        else:
            pieces = [text[start:start + chunk_size]
                      for start in range(0, len(text), chunk_size)]

        try:
            parts = [translator.translate(piece) for piece in pieces]
            return parts[0] if len(parts) == 1 else ''.join(parts)
        except Exception:
            # Best effort: a row that cannot be translated becomes NaN so a
            # single bad tweet does not abort the whole run.
            return np.nan

    translated_val = []
    for count, text in enumerate(df['tweet'], start=1):
        translated_val.append(_translate_text(text))
        # Progress indicator for long runs.
        if count % 1000 == 0:
            print(count)

    # Store the translations next to the original text.
    df['tweet_translated'] = translated_val

    return df
            

In [None]:
# Translate the cleaned tweets from Hindi ('hi') to English and preview the result.
source_language = 'hi'
translated_df = hinglish_to_english(tweets_cleaned_df, source_language)
translated_df.head()

In [None]:
# Persist the translated tweets without the index column.
translated_df.to_csv('/Users/nitanshjain/Documents/Projects/Twitter_Data_Analysis/v2/data/tweets_translated_v2.csv', index=False)

In [None]:
# Build a document-term matrix with CountVectorizer: unigrams only, English
# stop words excluded, and terms kept only if they occur in at least 10 tweets.
# NOTE(review): the matrix is built from the 'tweet' column, not from
# 'tweet_translated' - confirm this is the intended input.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', min_df=10, ngram_range=(1,1))
data_clean_cv = cv.fit_transform(translated_df.tweet)

# One row per tweet (sharing translated_df's index), one column per kept term.
data_clean_dtm = pd.DataFrame(
    data_clean_cv.toarray(),
    columns=cv.get_feature_names_out(),
    index=translated_df.index,
)
data_clean_dtm

In [None]:
# Write the cleaned dataset to CSV without the index column.
tweets_cleaned_df.to_csv('/Users/nitanshjain/Documents/Projects/Twitter_Data_Analysis/v2/data/tweets_cleaned_with_stopwords_v2.csv', index=False)