In [1]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')

import re


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
# Load the dataset from a CSV file located relative to the working directory
INPUT_CSV = 'cleaned_translation_data_1.csv'
df = pd.read_csv(INPUT_CSV)

In [21]:
# Preview the first five rows of the loaded frame
df.head()

Unnamed: 0.1,Unnamed: 0,Gender,Country,Hour,Day,Weekday,IsReshare,Reach,RetweetCount,Likes,Klout,Sentiment,Lang,text,clean_text,translated_text,valid_words
0,0,Unknown,Albania,7,12,Friday,True,339.0,127.0,0.0,44.0,0.0,en,"""RT @AdrianRusso82: Our Innovation Lab is offi...",our innovation lab officially open click learn...,our innovation lab officially open click learn...,"['innovation', 'lab', 'officially', 'open', 'c..."
1,1,Male,Albania,11,7,Thursday,False,87.0,0.0,0.0,22.0,0.0,en,Now Open AWS Asia Pacific (Seoul) Region via ...,now open aws asia pacific seoul region via are...,now open aws asia pacific seoul region via are...,"['open', 'pacific', 'region', 'via']"
2,2,Male,Albania,6,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,A Beginner's Guide to Scaling to 11 Million+ U...,a beginners guide scaling million users amazon...,a beginners guide scaling million users amazon...,"['guide', 'scaling', 'million', 'via']"
3,3,Male,Albania,10,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,Bridging AWS and Azure environments via VPN vi...,bridging aws azure environments via vpn via ar...,bridging aws azure environments via vpn via ar...,"['bridging', 'azure', 'via', 'via']"
4,4,Male,Albania,9,21,Thursday,False,85.0,0.0,0.0,21.0,0.0,en,ELK on AWS ElasticSearch Service + ElasticBean...,elk aws elasticsearch service elasticbeanstalk...,elk aws elasticsearch service elasticbeanstalk...,"['elk', 'service', 'via']"


In [22]:
# Drop the 'Unnamed: 0' column in place
# NOTE(review): presumably a row index written out by an earlier to_csv call - confirm
df.drop(columns=['Unnamed: 0'], inplace=True)

In [23]:
# Preview the first five rows again after the column drop
df.head()

Unnamed: 0,Gender,Country,Hour,Day,Weekday,IsReshare,Reach,RetweetCount,Likes,Klout,Sentiment,Lang,text,clean_text,translated_text,valid_words
0,Unknown,Albania,7,12,Friday,True,339.0,127.0,0.0,44.0,0.0,en,"""RT @AdrianRusso82: Our Innovation Lab is offi...",our innovation lab officially open click learn...,our innovation lab officially open click learn...,"['innovation', 'lab', 'officially', 'open', 'c..."
1,Male,Albania,11,7,Thursday,False,87.0,0.0,0.0,22.0,0.0,en,Now Open AWS Asia Pacific (Seoul) Region via ...,now open aws asia pacific seoul region via are...,now open aws asia pacific seoul region via are...,"['open', 'pacific', 'region', 'via']"
2,Male,Albania,6,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,A Beginner's Guide to Scaling to 11 Million+ U...,a beginners guide scaling million users amazon...,a beginners guide scaling million users amazon...,"['guide', 'scaling', 'million', 'via']"
3,Male,Albania,10,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,Bridging AWS and Azure environments via VPN vi...,bridging aws azure environments via vpn via ar...,bridging aws azure environments via vpn via ar...,"['bridging', 'azure', 'via', 'via']"
4,Male,Albania,9,21,Thursday,False,85.0,0.0,0.0,21.0,0.0,en,ELK on AWS ElasticSearch Service + ElasticBean...,elk aws elasticsearch service elasticbeanstalk...,elk aws elasticsearch service elasticbeanstalk...,"['elk', 'service', 'via']"


In [24]:
# Count how many rows carry each language code in the 'Lang' column
df['Lang'].value_counts()

Lang
en    93428
es     1812
fr     1097
_u      925
tr      899
ja      676
de      570
in      427
tl      421
ko      365
pt      239
nl      174
it      148
zh      124
th       81
pl       78
cs       74
ru       57
fi       51
et       42
ro       35
cy       29
ht       29
da       28
sv       28
ar       22
hu       20
no       13
lt        8
lv        6
hi        3
iw        2
sl        2
uk        2
eu        1
km        1
Name: count, dtype: int64

In [25]:
# (rows, columns) before any rows are filtered out
df.shape

(101917, 16)

#### At this point we decided to use the `Lang` column to drop the rows that were not translated into English, since the large majority of the rows are in English.

In [26]:
# Language codes whose rows are removed from the data set.
# Despite the variable name, the codes are compared for exact equality below,
# not searched for as substrings.
substrings_to_remove = ['_u','es', 'ht','in','it','ja','nl','pt','ro','tl','tr','zh']
# Exact membership test instead of a regex substring search: a substring search
# only gives the right answer while every code has the same length, and it would
# treat regex metacharacters in a code as syntax. Rows with a missing 'Lang'
# are kept by isin() rather than breaking the boolean negation.
rows_to_drop = df['Lang'].isin(substrings_to_remove)
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the original (SettingWithCopyWarning).
df = df[~rows_to_drop].copy()

In [27]:
# (rows, columns) after the language filter
df.shape

(96008, 16)

# Data Engineering

In [28]:
# Regex with two alternatives:
#   \b(?:\w\s*){1,3}\b  - one to three word characters (each optionally followed
#                         by whitespace) standing between word boundaries, i.e.
#                         words of at most three characters such as "a", "to", "via"
#   \W+                 - any run of non-word characters (punctuation, whitespace)
pattern = r'\b(?:\w\s*){1,3}\b|\W+'

# Compiled once under dedicated names so that clean_text() does not look up the
# generic global name `pattern` on every call; that name is also assigned in the
# language-filter cell, and re-running cells out of order would otherwise change
# what clean_text() removes.
_TEXT_NOISE_RE = re.compile(pattern)
_WHITESPACE_RUN_RE = re.compile(r'\s+')

# Custom function to clean text
def clean_text(text):
    """Remove short words and non-word characters from ``text``.

    Parameters
    ----------
    text : object
        Value to clean. ``None`` is treated as an empty string; any other
        non-string value is converted with ``str()`` first (a float NaN
        becomes ``'nan'``, which is then removed as a three-character word).

    Returns
    -------
    str
        The text with every match of the noise pattern replaced by a space,
        runs of whitespace collapsed to one space, and leading/trailing
        whitespace stripped. May be the empty string.
    """
    if text is None:
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Replace short words and punctuation with a space so neighbours stay separated
    without_noise = _TEXT_NOISE_RE.sub(' ', text)
    # Collapse the spaces left behind and trim both ends
    return _WHITESPACE_RUN_RE.sub(' ', without_noise).strip()

# Run clean_text over both text columns, overwriting each column with its cleaned version
for text_column in ('clean_text', 'translated_text'):
    df[text_column] = df[text_column].apply(clean_text)

In [29]:
# Split a piece of text into word tokens
def tokenize_text(text):
    """Return the ``word_tokenize`` tokens of ``text``.

    Any value that is not a ``str`` (for example NaN or None) yields an
    empty list instead of being passed to the tokenizer.
    """
    # Guard clause: only real strings are handed to the tokenizer
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

# Tokenize every entry of 'translated_text' and store the token lists in a new column
token_lists = df['translated_text'].apply(tokenize_text)
df['tokenize_text'] = token_lists
# Display the frame to inspect the new column
df

Unnamed: 0,Gender,Country,Hour,Day,Weekday,IsReshare,Reach,RetweetCount,Likes,Klout,Sentiment,Lang,text,clean_text,translated_text,valid_words,tokenize_text
0,Unknown,Albania,7,12,Friday,True,339.0,127.0,0.0,44.0,0.0,en,"""RT @AdrianRusso82: Our Innovation Lab is offi...",innovation officially open click learn tech ja...,innovation officially open click learn tech ja...,"['innovation', 'lab', 'officially', 'open', 'c...","[innovation, officially, open, click, learn, t..."
1,Male,Albania,11,7,Thursday,False,87.0,0.0,0.0,22.0,0.0,en,Now Open AWS Asia Pacific (Seoul) Region via ...,open asia pacific seoul region aresysadmin,open asia pacific seoul region aresysadmin,"['open', 'pacific', 'region', 'via']","[open, asia, pacific, seoul, region, aresysadmin]"
2,Male,Albania,6,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,A Beginner's Guide to Scaling to 11 Million+ U...,beginners guide scaling million users amazons ...,beginners guide scaling million users amazons ...,"['guide', 'scaling', 'million', 'via']","[beginners, guide, scaling, million, users, am..."
3,Male,Albania,10,12,Tuesday,False,87.0,0.0,0.0,22.0,0.0,en,Bridging AWS and Azure environments via VPN vi...,bridging azure environments aresysadmin,bridging azure environments aresysadmin,"['bridging', 'azure', 'via', 'via']","[bridging, azure, environments, aresysadmin]"
4,Male,Albania,9,21,Thursday,False,85.0,0.0,0.0,21.0,0.0,en,ELK on AWS ElasticSearch Service + ElasticBean...,elasticsearch service elasticbeanstalk laravel...,elasticsearch service elasticbeanstalk laravel...,"['elk', 'service', 'via']","[elasticsearch, service, elasticbeanstalk, lar..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101912,Male,Zimbabwe,10,7,Sunday,True,1203.0,3.0,0.0,44.0,3.0,en,RT @Springcoil: Does anyone have a good workfl...,does anyone good workflow using data science t...,does anyone good workflow using data science t...,"['anyone', 'good', 'data', 'science', 'spend',...","[does, anyone, good, workflow, using, data, sc..."
101913,Unknown,Zimbabwe,5,12,Tuesday,True,976.0,102.0,0.0,31.0,0.0,en,RT @linuxacademyCOM: AWS CSA Associate and Pro...,associate professional training available labs...,associate professional training available labs...,"['associate', 'professional', 'training', 'ava...","[associate, professional, training, available,..."
101914,Unknown,Zimbabwe,0,15,Tuesday,False,4876.0,0.0,0.0,49.0,1.0,en,Curious @benthompson does this change your opi...,curious change opinion efficient dropbox runni...,curious change opinion efficient dropbox runni...,"['curious', 'change', 'opinion', 'efficient', ...","[curious, change, opinion, efficient, dropbox,..."
101915,Unisex,Zimbabwe,8,2,Wednesday,True,188.0,5.0,0.0,37.0,0.0,en,RT @awscloud: New on the AWS Startup Blog - Wh...,startup blog what startups should know before ...,startup blog what startups should know before ...,"['new', 'know', 'choosing']","[startup, blog, what, startups, should, know, ..."


In [30]:
# Columns removed from the frame at this point (dropped in place)
columns_to_drop = [
    'Gender', 'Country', 'Hour', 'Day', 'Weekday', 'IsReshare',
    'Reach', 'RetweetCount', 'Likes', 'Klout', 'Lang', 'text', 'valid_words',
]
df.drop(columns=columns_to_drop, inplace=True)

The values in the `Sentiment` column are not well defined, so we divide them into three categories (Positive, Neutral, Negative) and then map each category to a number.

In [31]:
# The raw Sentiment scores are reduced to three categories by their sign:
#   score > 0  -> 'Positive'
#   score == 0 -> 'Neutral'
#   score < 0  -> 'Negative'
# np.sign() turns every score into 1.0, 0.0 or -1.0 and leaves NaN as NaN, so a
# row with a missing score gets a missing Mood. A plain if/else chain would label
# it 'Negative', because NaN fails both the `> 0` and the `== 0` comparison.
sign_to_mood = {1.0: 'Positive', 0.0: 'Neutral', -1.0: 'Negative'}
df['Mood'] = np.sign(df['Sentiment']).map(sign_to_mood)

In [32]:
# Define a custom mapping from Mood labels to numerical codes
mood_mapping = {
    'Neutral': 0,
    'Positive': 1,
    'Negative': 2
}

# Replace the labels in the 'Mood' column with their numerical codes (the column
# is overwritten; no separate encoded column is created).
# The dtype guard makes the cell safe to re-run: once 'Mood' already holds the
# codes, mapping it again would look the numbers up in a dict keyed by label
# strings and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Mood']):
    df['Mood'] = df['Mood'].map(mood_mapping)

In [33]:
# Make every entry of 'translated_text' a string: missing values become the
# empty string first, then any remaining non-string value is converted to str
translated_filled = df['translated_text'].fillna('')
df['translated_text'] = translated_filled.astype(str)

In [34]:
# Drop the 'Sentiment' column in place
df.drop(columns='Sentiment', inplace=True)

In [35]:
# (rows, columns) of the frame after the column drops
df.shape

(96008, 4)

In [36]:
# Preview the first five rows of the frame that is about to be saved
df.head(5)

Unnamed: 0,clean_text,translated_text,tokenize_text,Mood
0,innovation officially open click learn tech ja...,innovation officially open click learn tech ja...,"[innovation, officially, open, click, learn, t...",0
1,open asia pacific seoul region aresysadmin,open asia pacific seoul region aresysadmin,"[open, asia, pacific, seoul, region, aresysadmin]",0
2,beginners guide scaling million users amazons ...,beginners guide scaling million users amazons ...,"[beginners, guide, scaling, million, users, am...",0
3,bridging azure environments aresysadmin,bridging azure environments aresysadmin,"[bridging, azure, environments, aresysadmin]",0
4,elasticsearch service elasticbeanstalk laravel...,elasticsearch service elasticbeanstalk laravel...,"[elasticsearch, service, elasticbeanstalk, lar...",0


In [37]:
# Write the frame to CSV; index=True also writes the row index as the first column
OUTPUT_CSV = "final_cleaned_translation_data.csv"
df.to_csv(OUTPUT_CSV, index=True)