# Preliminaries and setup

1. Install and/or load required packages.
1. Load source data for *Challenge #2* provided by Piper project team.

## Install/Load required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Raw Data

Load in the various **Piper $\nabla$** challenge data sets:

In [3]:
# Data files

urlC1a = ('https://drive.google.com/file/d/1Bl8O9FAYVHtIK4cdvqHAymeLFZUV9DZ0/view', 'Ch1 small')
urlC1b = ('https://drive.google.com/file/d/1BitVVnJweFfP38i-5p8FdsrN8Rr1DYh7/view', 'Ch1 medium')
urlC1c = ('https://drive.google.com/file/d/1iR447fRNgxIHkkE8oErv7ke3IDhGulIG/view', 'Ch1 large')

urlC2a = ('https://drive.google.com/file/d/12FPSxVnxzmvkxnS3nMD3MXKKIwxSytFf/view', 'Ch2 small')
urlC2c = ('https://drive.google.com/file/d/123oxrOJcUd2FeDmR6QBlznbTdwkcYpQk/view', 'Ch2 large')


Load data for *Challenge #2* (uncomment desired data set size): 

In [4]:
def read_from_url(url):
    return 'https://drive.google.com/uc?id='+url.split('/')[-2]

# read in C2 large file

# url = urlC2c[0]
# offline_tweets_df = pd.read_csv(read_from_url(url))

# read in C2 small file

# url = urlC2a[0]
url = urlC2c[0]
offline_tweets_df = pd.read_csv(read_from_url(url), index_col=0)

display(offline_tweets_df)


Unnamed: 0,id,created_at,reply_to_id,coordinates,place,retweet_count,favorite_count,lang,source,is_quote_status,user_id,user_loc,verified,followers_count,friends_count,favourites_count,statuses_count,listed_count,user_created_at,screen_name,user_description,text,tweet category
0,5/6/2021 19:30,1390388447161065472,0,,,332,642,en,Twitter for iPhone,False,1.209417e+09,"Jefferson, LA",True,458947,1291,2435,12962,2662,2/22/2013 18:10,SteveScalise,b'House Republican Whip. Dad. LSU & Saints fan...,b'Biden wants $6 trillion in new spending.\n\n...,2.0
1,5/6/2021 18:46,1390377523637362696,0,,,426,1110,en,Twitter for iPhone,False,2.114966e+07,Texas,True,94664,165,4474,8643,539,2/18/2009 0:09,ArthurSchwartz,b'',b'Biden pushes another preposterous lie about ...,2.0
2,5/6/2021 18:41,1390376273902514177,0,,,101,279,en,Twitter Media Studio,False,5.532916e+07,"Washington, DC",True,96369,3294,8,21756,1223,7/9/2009 19:08,RNCResearch,"b'Exposing the lies, hypocrisy, and failed far...","b'Joe Biden falsely claims his ""infrastructure...",2.0
3,5/7/2021 13:28,1390659692439871488,1390649154599280642,,,0,0,en,Twitter for iPhone,False,2.451320e+08,,False,471,2783,51699,48991,20,1/30/2011 23:38,EEstaris,"b""Adversity does not test one's character, it ...",b'@kingbuddah4 @CNBC Can we just pass an infra...,0.0
4,5/7/2021 13:27,1390659674073096192,1390658945585451009,,,0,1,en,Twitter Web App,False,9.570000e+17,,False,622,1540,54311,70486,7,1/28/2018 0:28,SandraKM123,b'Semi retired. Public Health MD MPH MS Hobby...,b'@jsolomonReports I would agree about Infrast...,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16633,4/30/2021 2:35,1387958778868473857,0,,,203,0,en,Twitter for Android,False,1.320000e+18,,False,672,2712,28084,12369,3,10/12/2020 3:18,brodon0511,Biden/Harris2020💙💙politics is a zero sum game💙...,"b""RT @NBCNews: U.S. Senate overwhelmingly pass...",
16634,4/30/2021 2:34,1387958550924693504,1387950272346042368,,,1,2,en,Twitter for Android,False,6.039407e+07,Earth,False,339,136,7324,52961,54,7/26/2009 20:53,mikegonzalez2k,"Android Developer, Physics and AI Programmer, ...","b""@tedcruz Ted was the only other vote against...",
16635,4/30/2021 2:34,1387958544616640513,0,,,2123,0,en,Twitter for iPhone,False,1.380000e+18,,False,733,833,3707,332,2,3/27/2021 18:35,agoodforbetter,Progressive from Sconnie #Resist #FightTheStat...,b'RT @mmpadellan: Republicans are against:\n- ...,-2.0
16636,4/30/2021 2:34,1387958480712044544,0,,,29,0,en,Twitter Web App,True,7.090000e+17,Bremerton WA,False,2601,2702,244069,489884,53,3/13/2016 6:11,painter_nancy,"I design fabric, fight evil & read dead people...","b""RT @dick_nixon: Ocasio-Cortez wants a $10 tr...",


In [5]:
# Correct swapped `id` and `created_at` column names
offline_tweets_df.rename(columns={'id':'created_at', 'created_at':'id'}, inplace=True)

offline_tweets_df.head()

Unnamed: 0,created_at,id,reply_to_id,coordinates,place,retweet_count,favorite_count,lang,source,is_quote_status,user_id,user_loc,verified,followers_count,friends_count,favourites_count,statuses_count,listed_count,user_created_at,screen_name,user_description,text,tweet category
0,5/6/2021 19:30,1390388447161065472,0,,,332,642,en,Twitter for iPhone,False,1209417000.0,"Jefferson, LA",True,458947,1291,2435,12962,2662,2/22/2013 18:10,SteveScalise,b'House Republican Whip. Dad. LSU & Saints fan...,b'Biden wants $6 trillion in new spending.\n\n...,2.0
1,5/6/2021 18:46,1390377523637362696,0,,,426,1110,en,Twitter for iPhone,False,21149660.0,Texas,True,94664,165,4474,8643,539,2/18/2009 0:09,ArthurSchwartz,b'',b'Biden pushes another preposterous lie about ...,2.0
2,5/6/2021 18:41,1390376273902514177,0,,,101,279,en,Twitter Media Studio,False,55329160.0,"Washington, DC",True,96369,3294,8,21756,1223,7/9/2009 19:08,RNCResearch,"b'Exposing the lies, hypocrisy, and failed far...","b'Joe Biden falsely claims his ""infrastructure...",2.0
3,5/7/2021 13:28,1390659692439871488,1390649154599280642,,,0,0,en,Twitter for iPhone,False,245132000.0,,False,471,2783,51699,48991,20,1/30/2011 23:38,EEstaris,"b""Adversity does not test one's character, it ...",b'@kingbuddah4 @CNBC Can we just pass an infra...,0.0
4,5/7/2021 13:27,1390659674073096192,1390658945585451009,,,0,1,en,Twitter Web App,False,9.57e+17,,False,622,1540,54311,70486,7,1/28/2018 0:28,SandraKM123,b'Semi retired. Public Health MD MPH MS Hobby...,b'@jsolomonReports I would agree about Infrast...,1.0


# Data Parsing

1. Initial data cleaning
1. Parsing mentions, retweets, hastags, and web links

## Intial Data Cleaning

In [6]:
# replace Nans in user_description with empty strings
offline_tweets_df.user_description.fillna('', inplace=True)
print('Nans replaced')

# convert comments ("text") from byte encoding to utf8 
import ast
offline_tweets_df['text2']=offline_tweets_df['text'].apply(lambda x: ast.literal_eval(x).decode('utf-8'))
#offline_tweets_df['userdescr']=offline_tweets_df['user_description'].apply(lambda x: ast.literal_eval(x).decode('utf-8'))
display(offline_tweets_df[['user_description','text','text2']])

Nans replaced


Unnamed: 0,user_description,text,text2
0,b'House Republican Whip. Dad. LSU & Saints fan...,b'Biden wants $6 trillion in new spending.\n\n...,Biden wants $6 trillion in new spending.\n\nWh...
1,b'',b'Biden pushes another preposterous lie about ...,Biden pushes another preposterous lie about hi...
2,"b'Exposing the lies, hypocrisy, and failed far...","b'Joe Biden falsely claims his ""infrastructure...","Joe Biden falsely claims his ""infrastructure"" ..."
3,"b""Adversity does not test one's character, it ...",b'@kingbuddah4 @CNBC Can we just pass an infra...,@kingbuddah4 @CNBC Can we just pass an infrast...
4,b'Semi retired. Public Health MD MPH MS Hobby...,b'@jsolomonReports I would agree about Infrast...,@jsolomonReports I would agree about Infrastru...
...,...,...,...
16633,Biden/Harris2020💙💙politics is a zero sum game💙...,"b""RT @NBCNews: U.S. Senate overwhelmingly pass...",RT @NBCNews: U.S. Senate overwhelmingly passes...
16634,"Android Developer, Physics and AI Programmer, ...","b""@tedcruz Ted was the only other vote against...",@tedcruz Ted was the only other vote against t...
16635,Progressive from Sconnie #Resist #FightTheStat...,b'RT @mmpadellan: Republicans are against:\n- ...,RT @mmpadellan: Republicans are against:\n- Th...
16636,"I design fabric, fight evil & read dead people...","b""RT @dick_nixon: Ocasio-Cortez wants a $10 tr...",RT @dick_nixon: Ocasio-Cortez wants a $10 tril...


In [7]:
# convert user_description from byte encoding to utf8 

# not all rows are byte-encoded so need to create a mask:
is_byte = (offline_tweets_df.user_description.str.startswith("b'") | offline_tweets_df.user_description.str.startswith('b"'))

# vector of clean utf8 encoded user_descriptions
clean_ud = offline_tweets_df.loc[is_byte,'user_description'].apply(lambda x: ast.literal_eval(x).decode('utf-8'))
print(len(clean_ud))

clean_ud.head()

offline_tweets_df.loc[is_byte, 'user_description'] = clean_ud
display(offline_tweets_df[['user_description','text','text2']])


15981


Unnamed: 0,user_description,text,text2
0,House Republican Whip. Dad. LSU & Saints fan. ...,b'Biden wants $6 trillion in new spending.\n\n...,Biden wants $6 trillion in new spending.\n\nWh...
1,,b'Biden pushes another preposterous lie about ...,Biden pushes another preposterous lie about hi...
2,"Exposing the lies, hypocrisy, and failed far-l...","b'Joe Biden falsely claims his ""infrastructure...","Joe Biden falsely claims his ""infrastructure"" ..."
3,"Adversity does not test one's character, it RE...",b'@kingbuddah4 @CNBC Can we just pass an infra...,@kingbuddah4 @CNBC Can we just pass an infrast...
4,Semi retired. Public Health MD MPH MS Hobby--...,b'@jsolomonReports I would agree about Infrast...,@jsolomonReports I would agree about Infrastru...
...,...,...,...
16633,Biden/Harris2020💙💙politics is a zero sum game💙...,"b""RT @NBCNews: U.S. Senate overwhelmingly pass...",RT @NBCNews: U.S. Senate overwhelmingly passes...
16634,"Android Developer, Physics and AI Programmer, ...","b""@tedcruz Ted was the only other vote against...",@tedcruz Ted was the only other vote against t...
16635,Progressive from Sconnie #Resist #FightTheStat...,b'RT @mmpadellan: Republicans are against:\n- ...,RT @mmpadellan: Republicans are against:\n- Th...
16636,"I design fabric, fight evil & read dead people...","b""RT @dick_nixon: Ocasio-Cortez wants a $10 tr...",RT @dick_nixon: Ocasio-Cortez wants a $10 tril...


In [8]:
from bs4 import BeautifulSoup
# from nltk.tokenize import WordPunctTokenizer

# tok = WordPunctTokenizer()

# pat1 = r'@[A-Za-z0-9]+'
# pat2 = r'https?://[A-Za-z0-9./]+'
# combined_pat = r'|'.join((pat1, pat2))

# Convert ascii codes and line breaks
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    return re.sub(r"\n", ' ', souped)

offline_tweets_df['text3'] = offline_tweets_df['text2'].map(lambda x: tweet_cleaner(x))
offline_tweets_df['user_descr'] = offline_tweets_df['user_description'].map(lambda x: tweet_cleaner(x))

display(offline_tweets_df[['text2','text3']][offline_tweets_df.text2!=offline_tweets_df.text3])


  ' that document to Beautiful Soup.' % decoded_markup
https://t.co/w842dlBY9A
https://t.co/jJVlm4Om4n" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


Unnamed: 0,text2,text3
0,Biden wants $6 trillion in new spending.\n\nWh...,Biden wants $6 trillion in new spending. Whic...
8,RT @neomano: @ProjectLincoln America First...b...,RT @neomano: @ProjectLincoln America First...b...
10,RT @ChrisLu44: Shouldn’t read too much into on...,RT @ChrisLu44: Shouldn’t read too much into on...
11,"welp, kennedy may be a no even if the bill is ...","welp, kennedy may be a no even if the bill is ..."
15,"@Bill_Tibbitts ""There are shovel-ready project...","@Bill_Tibbitts ""There are shovel-ready project..."
...,...,...
16626,RT @mmpadellan: Republicans are against:\n- Th...,RT @mmpadellan: Republicans are against: - The...
16629,RT @mmpadellan: Republicans are against:\n- Th...,RT @mmpadellan: Republicans are against: - The...
16634,@tedcruz Ted was the only other vote against t...,@tedcruz Ted was the only other vote against t...
16635,RT @mmpadellan: Republicans are against:\n- Th...,RT @mmpadellan: Republicans are against: - The...


## Parsing Mentions, Retweets, Hastags, and Links

In [9]:
# Functions to identify retweets, mentions, hashtags, and links

def find_retweeted(tweet):
  '''This function will extract the twitter handles of retweed people'''
  return re.findall('(?<=RT\s)(@[A-Za-z]+[A-Za-z0-9-_]+)', tweet)

def find_mentioned(tweet):
  '''This function will extract the twitter handles of people mentioned in the tweet'''
  return re.findall('(?<!RT\s)(@[A-Za-z]+[A-Za-z0-9-_]+)', tweet)  

def find_hashtags(tweet):
  '''This function will extract hashtags'''
  return re.findall('(#[A-Za-z]+[A-Za-z0-9-_]+)', tweet)  

def find_links(tweet):
  '''This function will extract url links'''
  http_pattern = r'https?://[A-Za-z0-9./]+'
  bitly_pattern = r'bit.ly/\S+'
  pattern = r'|'.join((http_pattern, bitly_pattern))
  return re.findall(pattern, tweet)


In [10]:
# make new columns for retweeted usernames, mentioned usernames and hashtags

offline_tweets_df['is_retweet'] = offline_tweets_df['text3'].apply(lambda x: x[:2]=='RT')
offline_tweets_df['retweeted_from'] = offline_tweets_df.text3.apply(find_retweeted)
offline_tweets_df['mentioned'] = offline_tweets_df.text3.apply(find_mentioned)
offline_tweets_df['hashtags'] = offline_tweets_df.text3.apply(find_hashtags)
offline_tweets_df['links'] = offline_tweets_df.text3.apply(find_links)

display(offline_tweets_df[['is_retweet','retweeted_from','mentioned','hashtags','links']].head(10))


Unnamed: 0,is_retweet,retweeted_from,mentioned,hashtags,links
0,False,[],[],[],[https://t.co/e1jrD5n6wK]
1,False,[],[],[],[https://t.co/CT4yDoUoNw]
2,False,[],[],[],[https://t.co/YTyiQeKOLM]
3,False,[],"[@kingbuddah4, @CNBC]",[],[]
4,False,[],[@jsolomonReports],[],[]
5,False,[],[],[],[https://t.co/rf4N5CEDlC]
6,True,[@SusanStJames3_],[],[],[]
7,True,[@Charlen60403930],[],[],[]
8,True,[@neomano],[@ProjectLincoln],[],[]
9,False,[],[],[],[https://t.co/BN3ZWT4qBZ]


#Substitutions
Substitute http for links and @user for mentions in tweet text

In [11]:
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'

def preprocess(text, pat1repl='@user', pat2repl='http'):
    stripped = re.sub(pat1, pat1repl, text)
    stripped = re.sub(pat2, pat2repl, stripped)
    return stripped

text = offline_tweets_df['text2'].map(lambda x: preprocess(x))


# Save Prepared Dataframe

In [12]:
offline_tweets_df.to_pickle('/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/cleaned_offline_tweets_df_large.pickle')

# Coding References

* [Topic Extraction from Tweets using LDA, by *Usen Osas*](https://medium.com/@osas.usen/topic-extraction-from-tweets-using-lda-a997e4eb0985)

* [Topic Modelling in Python: Unsupervised Machine Learning to Find Tweet Topics](https://ourcodingclub.github.io/tutorials/topic-modelling-python/)