#**Data Preprocessing for Authorship Analysis**

##**Load Dataset**

In [56]:
import pandas as pd

#data_age = pd.read_csv("/content/drive/MyDrive/Authorship/age_dataset.csv", quotechar='|', lineterminator='\n', low_memory=False)
#data_gender = pd.read_csv("/content/drive/MyDrive/Authorship/genders_dataset.csv", quotechar='|', lineterminator='\n', low_memory=False)
data_politics = pd.read_csv("/content/drive/MyDrive/Authorship/politics_dataset.csv", quotechar='|', lineterminator='\n', low_memory=False)

In [58]:
data_politics

Unnamed: 0,username,user_id,tweet_id,tweet,political_stance
0,fuchsofficial,2309779899,1.409303e+18,@andielcfcsutton @lcfcgeorgee People tag me at...,dem
1,fuchsofficial,2309779899,1.408952e+18,Thank you Mike! It’s my turn for the next fund...,dem
2,fuchsofficial,2309779899,1.408931e+18,@lcfcgeorgee You sure? No other player ever? H...,dem
3,fuchsofficial,2309779899,1.408764e+18,It has been an amazing experience to visit @We...,dem
4,fuchsofficial,2309779899,1.407433e+18,Dave Merrell has captured my pass in another a...,dem
...,...,...,...,...,...
444937,sjcycling,1023573182,1.344784e+18,@littledeekay You are very brave ❤️,dem
444938,sjcycling,1023573182,1.344685e+18,@JRubinBlogger So well said! I couldn’t agree ...,dem
444939,sjcycling,1023573182,1.344427e+18,I really don’t like robotic #RadicalKelly,dem
444940,sjcycling,1023573182,1.344393e+18,@HawleyMO Your transparent political posturing...,dem


In [59]:
df = data_politics
df['tweet'].dropna(inplace=True)
df['tweet_str'] = df['tweet'].astype(str)
df

Unnamed: 0,username,user_id,tweet_id,tweet,political_stance,tweet_str
0,fuchsofficial,2309779899,1.409303e+18,@andielcfcsutton @lcfcgeorgee People tag me at...,dem,@andielcfcsutton @lcfcgeorgee People tag me at...
1,fuchsofficial,2309779899,1.408952e+18,Thank you Mike! It’s my turn for the next fund...,dem,Thank you Mike! It’s my turn for the next fund...
2,fuchsofficial,2309779899,1.408931e+18,@lcfcgeorgee You sure? No other player ever? H...,dem,@lcfcgeorgee You sure? No other player ever? H...
3,fuchsofficial,2309779899,1.408764e+18,It has been an amazing experience to visit @We...,dem,It has been an amazing experience to visit @We...
4,fuchsofficial,2309779899,1.407433e+18,Dave Merrell has captured my pass in another a...,dem,Dave Merrell has captured my pass in another a...
...,...,...,...,...,...,...
444937,sjcycling,1023573182,1.344784e+18,@littledeekay You are very brave ❤️,dem,@littledeekay You are very brave ❤️
444938,sjcycling,1023573182,1.344685e+18,@JRubinBlogger So well said! I couldn’t agree ...,dem,@JRubinBlogger So well said! I couldn’t agree ...
444939,sjcycling,1023573182,1.344427e+18,I really don’t like robotic #RadicalKelly,dem,I really don’t like robotic #RadicalKelly
444940,sjcycling,1023573182,1.344393e+18,@HawleyMO Your transparent political posturing...,dem,@HawleyMO Your transparent political posturing...


## **Data Preprocessing**
Data cleaning procedure before analysis:
*   Remove URLs and HTML links
*   Remove mention (‘@\w+’)
*   Remove numbers
*   Remove hashtags ('#\w+')
*   Remove punctuation
*   Remove Emoji and Emotions
*   Remove stopwords
*   Change to lower case
*   Remove extra white space left
*   Combine tweets of a single user into a single document
*   Tokenize the corpus and create the vocabulary








####**Remove URLs and HTML links**

In [60]:
# Remove URLs and HTML links
import re
def check_urls(text):
    return re.findall(r'https?://\S+|www\.\S+',text)

def remove_urls(text):
    url_remove = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(r'https?://\S+|www\.\S+','',text)

df['check']=df['tweet_str'].apply(lambda x:check_urls(x))
print("-----check-----")
for x in df['check'][:4]:
  print(x) 

df['tweet_str']=df['tweet_str'].apply(lambda x:remove_urls(x))
print("\n-----result-----")
for x in df['tweet_str'][:4]:
  print("*** "+x)

-----check-----
['https://t.co/jvrndAIpZo']
[]
[]
['https://t.co/aJyVmdosjw']

-----result-----
*** @andielcfcsutton @lcfcgeorgee People tag me at times doi  
*** Thank you Mike! It’s my turn for the next fundraiser! 💪🏼 #Charity #fundraising
*** @lcfcgeorgee You sure? No other player ever? He has a couple of years to go before he can claim that title! 🤔😉
*** It has been an amazing experience to visit @WestPoint_USMA. Thank you for hosting and letting me do some combat simulation drills @armywpesports. Looking forward to come back! Thank you for your service! @USArmyesports @USArmy  #USArmy @GoArmyWestPoint @ArmyWP_WSocc  


####**Remove mention (‘@\w+’)**


In [61]:
# Remove mention
import re
def check_mention(x):
    return re.findall(r'@\w+',x)
def remove_mention(x):
    return re.sub(r'@\w+','',x)

df['check']=df['tweet_str'].apply(lambda x:check_mention(x))
print("-----check-----")
for x in df['check'][:4]:
  print(x) 

df['tweet_str']=df['tweet_str'].apply(lambda x:remove_mention(x))
print("\n-----result-----")
for x in df['tweet_str'][:4]:
  print("*** "+x) 

-----check-----
['@andielcfcsutton', '@lcfcgeorgee']
[]
['@lcfcgeorgee']
['@WestPoint_USMA', '@armywpesports', '@USArmyesports', '@USArmy', '@GoArmyWestPoint', '@ArmyWP_WSocc']

-----result-----
***   People tag me at times doi  
*** Thank you Mike! It’s my turn for the next fundraiser! 💪🏼 #Charity #fundraising
***  You sure? No other player ever? He has a couple of years to go before he can claim that title! 🤔😉
*** It has been an amazing experience to visit . Thank you for hosting and letting me do some combat simulation drills . Looking forward to come back! Thank you for your service!    #USArmy    


####**Remove numbers**


In [62]:
#Remove numbers

def check_num(text):
    return re.findall(r'\d+',text)
def remove_num(text):
    return re.sub(r'\d+','',text)
    
df['check']=df['tweet_str'].apply(lambda x:check_num(x))
print("-----check-----")
[print(x) for x in df['check'][:4]]

df['tweet_str']=df['tweet_str'].apply(lambda x:remove_num(x))
print("\n-----result-----")
[print("*** "+x) for x in df['tweet_str'][:4]]

-----check-----
[]
[]
[]
[]

-----result-----
***   People tag me at times doi  
*** Thank you Mike! It’s my turn for the next fundraiser! 💪🏼 #Charity #fundraising
***  You sure? No other player ever? He has a couple of years to go before he can claim that title! 🤔😉
*** It has been an amazing experience to visit . Thank you for hosting and letting me do some combat simulation drills . Looking forward to come back! Thank you for your service!    #USArmy    


[None, None, None, None]

####**Remove hashtags ('#\w+')**


In [63]:
# Remove hashtags

def check_hash(text):
    return re.findall(r'#\w+',text)
def remove_hash(text):
    return re.sub(r'#\w+','',text)

df['check']=df['tweet_str'].apply(lambda x:check_hash(x))
print("-----check-----")
[print(x) for x in df['check'][:4]]

df['tweet_str']=df['tweet_str'].apply(lambda x:remove_hash(x))
print("\n-----result-----")
[print("*** "+x) for x in df['tweet_str'][:4]]

-----check-----
[]
['#Charity', '#fundraising']
[]
['#USArmy']

-----result-----
***   People tag me at times doi  
*** Thank you Mike! It’s my turn for the next fundraiser! 💪🏼  
***  You sure? No other player ever? He has a couple of years to go before he can claim that title! 🤔😉
*** It has been an amazing experience to visit . Thank you for hosting and letting me do some combat simulation drills . Looking forward to come back! Thank you for your service!        


[None, None, None, None]

####**Remove punctuation**


In [64]:
# Remove punctuation

def check_punct(text):
    return re.findall(r'[^\w\s\d]',text)
def remove_punct(text):
    return re.sub(r"[^\w\s\d]","", text)

df['check']=df['tweet_str'].apply(lambda x:check_punct(x))
print("-----check-----")
[print(x) for x in df['tweet_str'][:4]]

df['tweet_str']=df['tweet_str'].apply(lambda x:remove_punct(x))
print("\n-----result-----")
[print("*** "+x) for x in df['tweet_str'][:4]]

-----check-----
  People tag me at times doi  
Thank you Mike! It’s my turn for the next fundraiser! 💪🏼  
 You sure? No other player ever? He has a couple of years to go before he can claim that title! 🤔😉
It has been an amazing experience to visit . Thank you for hosting and letting me do some combat simulation drills . Looking forward to come back! Thank you for your service!        

-----result-----
***   People tag me at times doi  
*** Thank you Mike Its my turn for the next fundraiser   
***  You sure No other player ever He has a couple of years to go before he can claim that title 
*** It has been an amazing experience to visit  Thank you for hosting and letting me do some combat simulation drills  Looking forward to come back Thank you for your service        


[None, None, None, None]

####**Remove Emoji and Emotions**


In [65]:
#Check emoji
def check_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return re.findall(emoji_pattern, text)

df['check']=df['tweet_str'].apply(lambda x:check_emoji(x))
print("-----check-----")
[print(x) for x in df['check'][:4]]

-----check-----
[]
[]
[]
[]


[None, None, None, None]

In [66]:
#Remove Emoji and Emotions

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df['tweet_str']=df['tweet_str'].apply(lambda x: remove_emoji(x))

print("\n-----result-----")
[print("*** "+x) for x in df['tweet_str'][:4]]


-----result-----
***   People tag me at times doi  
*** Thank you Mike Its my turn for the next fundraiser   
***  You sure No other player ever He has a couple of years to go before he can claim that title 
*** It has been an amazing experience to visit  Thank you for hosting and letting me do some combat simulation drills  Looking forward to come back Thank you for your service        


[None, None, None, None]

####**Remove accented characters**

In [67]:
import unicodedata
def remove_accent(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError:
        pass
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    return str(text)
df['tweet_str']=df['tweet_str'].apply(lambda x: remove_accent(x))

In [68]:
!pip install contractions
import contractions

df['tweet_str']=df['tweet_str'].apply(lambda x: contractions.fix(x))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


####**Remove stopwords**

In [69]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
# Remove stopwords
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
df['tweet_str']=df['tweet_str'].apply(lambda x:remove_stopwords(x))
print("\n-----result-----")
[print("*** "+x) for x in df['tweet_str'][:4]]


-----result-----
*** People tag times doi
*** Thank Mike Its turn next fundraiser
*** You sure No player ever He couple years go claim title
*** It amazing experience visit Thank hosting letting combat simulation drills Looking forward come back Thank service


[None, None, None, None]

####**Change to lower case**

In [71]:
# Change to lower case
df['tweet_str']=df['tweet_str'].apply(lambda x: x.lower())
[print("*** "+x) for x in df['tweet_str'][:4]]

*** people tag times doi
*** thank mike its turn next fundraiser
*** you sure no player ever he couple years go claim title
*** it amazing experience visit thank hosting letting combat simulation drills looking forward come back thank service


[None, None, None, None]

####**Remove extra white space left**


In [72]:
# Remove extra white space left
def remove_space(text):
    return re.sub(r"\s+"," ",text).strip()
df['tweet_str']=df['tweet_str'].apply(lambda x:remove_space(x))
print("\n-----result-----")
[print("*** "+x) for x in df['tweet_str'][:4]]


-----result-----
*** people tag times doi
*** thank mike its turn next fundraiser
*** you sure no player ever he couple years go claim title
*** it amazing experience visit thank hosting letting combat simulation drills looking forward come back thank service


[None, None, None, None]

####**Combine tweets of a single user into a single document**
Combine all tweets of a single user into a single document and treat each tweet as an individual sentence, and use the combined result as the corpus


In [73]:
corpus = df['tweet_str']
corpus

0                                      people tag times doi
1                       thank mike its turn next fundraiser
2         you sure no player ever he couple years go cla...
3         it amazing experience visit thank hosting lett...
4         dave merrell captured pass another amazing lim...
                                ...                        
444937                                            you brave
444938    so well said i could agree i am already seeing...
444939                                i really like robotic
444940    your transparent political posturing pretty pa...
444941                                                  nan
Name: tweet_str, Length: 444942, dtype: object

In [74]:
new_df = df.groupby('user_id').agg({'tweet_str' : ' '.join, 'political_stance': 'first'})
new_df

Unnamed: 0_level_0,tweet_str,political_stance
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
7204,he always look like hes trying figure crapped ...,dem
755086,not like bitcoin are sure loons xeni logging o...,dem
813286,libraries citadels knowledge empathyand played...,dem
939091,the science clear the best way protect virus v...,dem
1917731,times up says bill cosby guilty regardless pen...,dem
...,...,...
1410635445596307463,thanks,dem
1410636559590707202,attorneys company downplayed criminal charges ...,dem
1410656438372950016,just breath,dem
1410662136859271171,she dole mccarthy man in dreams the legal syst...,dem


####**Tokenize the corpus and create the vocabulary**

In [75]:
#tokenized_corpus
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)


In [76]:
# Create vocabulary
def create_vocabulary(token_list): 
  vocabulary = []
  for sentence in token_list:
    for token in sentence:
      if token not in vocabulary:
        vocabulary.append(token)
  
  word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
  idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
  return vocabulary, word2idx, idx2word


In [77]:
vocabulary_politics, word2idx_politics, idx2word_politics = create_vocabulary(tokenized_corpus)
#vocabulary_age, word2idx_age, idx2word_age = create_vocabulary(tokenized_corpus)
#vocabulary_gender, word2idx_gender, idx2word_gender = create_vocabulary(tokenized_corpus)
print(len(vocabulary_politics))

95408
