In [None]:
import pandas as pd
import re
import nltk
nltk.download ("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
!gdown --id 1wXG8OxZ5rtz2OUJRp2GdzAZzPKHgv3Ig

Downloading...
From: https://drive.google.com/uc?id=1wXG8OxZ5rtz2OUJRp2GdzAZzPKHgv3Ig
To: /content/Coachella2015.csv
  0% 0.00/656k [00:00<?, ?B/s]100% 656k/656k [00:00<00:00, 86.7MB/s]


In [None]:
# Load the tweet export (decoded as latin1) and preview the first rows.
# NOTE(review): the recorded preview shows garbled sequences such as "Ûª" and
# "_ÙÓ" in the `text` column — confirm latin1 is the right encoding for this file.
df = pd.read_csv("Coachella2015.csv", encoding = "latin1")
df.head()

Unnamed: 0,coachella_sentiment,coachella_yn,name,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,positive,yes,kokombil,0,#Coachella2015 tickets selling out in less tha...,"[0.0, 0.0]",1/7/15 15:02,5.52963e+17,,Quito
1,positive,yes,MisssTaraaa10,2,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,,1/7/15 15:02,5.52963e+17,united states,
2,positive,yes,NMcCracken805,0,#Coachella2015 #VIP passes secured! See you th...,,1/7/15 15:01,5.52963e+17,"Costa Mesa, CA",
3,positive,yes,wxpnfm,1,PhillyÛªs @warondrugsjams will play #Coachell...,,1/7/15 15:01,5.52963e+17,"Philadelphia, PA and Worldwide",Quito
4,positive,yes,Caesears,0,If briana and her mom out to #Coachella2015 i...,,1/7/15 15:00,5.52963e+17,,


In [None]:
df.count()

coachella_sentiment    3846
coachella_yn           3846
name                   3846
retweet_count          3846
text                   3846
tweet_coord             242
tweet_created          3846
tweet_id               3846
tweet_location         2676
user_timezone          2863
dtype: int64

## 1. Extract hashtags

In [None]:
def hashtag(tweet):
  """Return every hashtag found in ``tweet``, without the leading ``#``.

  A hashtag is a ``#`` immediately followed by one or more word characters.
  Tags are returned in order of appearance; the list is empty when the
  tweet contains none.
  """
  tag_pattern = r"#(\w+)"
  return re.findall(tag_pattern, tweet)

In [None]:
# Sanity check: a tweet with three hashtags; the recorded output lists the
# three tags in order of appearance, without the leading "#".
sampletweet = "#gameofthrones is one of the best TV shows ever! #GOT #khaleesi"
hashtag(sampletweet)

['gameofthrones', 'GOT', 'khaleesi']

## 2. Remove retweet usernames and @username mentions

In [None]:
def remove_username(tweet):
  """Delete every @mention from ``tweet`` (including the handle in "RT @user").

  A mention is ``@`` followed by one or more handle characters (ASCII
  letters, digits, underscore; the hyphen is still accepted, as before).
  The earlier pattern required a leading letter plus at least one more
  character, so handles such as ``@_name``, ``@2015fan`` or ``@a`` were
  left in the text.

  Only the mention itself is removed; surrounding spaces and punctuation
  are kept, and the cleaned string is returned.
  """
  # The hyphen sits last in the class so it is unambiguously a literal.
  mention_pattern = r"@[A-Za-z0-9_-]+"
  return re.sub(mention_pattern, "", tweet)

In [None]:
# Sanity check: both trailing @mentions are removed; the spaces that
# surrounded them remain in the recorded output.
sampletweet2 = "#gameofthrones is one of the best TV shows ever! #GOT #khaleesi @username @username2"
remove_username(sampletweet2)

'#gameofthrones is one of the best TV shows ever! #GOT #khaleesi  '

## 3. Remove links

In [None]:
def remove_links(tweet):
  """Replace each link in ``tweet`` with a single space.

  Anything that starts with ``http`` and runs up to the next whitespace
  character is treated as a link.
  """
  return re.sub(r"http\S+", " ", tweet)

In [None]:
# Sanity check: the trailing URL is replaced by a single space.
sampletweet3 = "#gameofthrones is one of the best TV shows ever! #GOT #khaleesi https://got.com/"
remove_links(sampletweet3)

'#gameofthrones is one of the best TV shows ever! #GOT #khaleesi  '

In [None]:
def strip_link(tweet):
  """Remove every literal ``[link]`` placeholder from ``tweet``.

  ``str.strip("[link]")`` treats its argument as a *set of characters* and
  trims any of ``[``, ``l``, ``i``, ``n``, ``k``, ``]`` from both ends, so it
  ate the edges of ordinary words ("kill the link" -> " the ") and never
  touched a placeholder in the middle of the text. ``str.replace`` removes
  the placeholder itself and nothing else.

  Returns the tweet with all ``[link]`` occurrences deleted; surrounding
  whitespace is left as it was.
  """
  return tweet.replace("[link]", "")

In [None]:
# NOTE: this sample contains no "[link]" placeholder, so it only shows that a
# tweet without one comes back unchanged; it does not exercise the removal.
sampletweet4 = "#gameofthrones is one of the best TV shows ever! #GOT #khaleesi https://got.com/"
strip_link(sampletweet4)

'#gameofthrones is one of the best TV shows ever! #GOT #khaleesi https://got.com/'

## 4. Remove non-ASCII characters

In [None]:
def remove_non_ascii(tweet):
  """Return ``tweet`` with every non-ASCII character dropped.

  A character is kept only when its code point (``ord``) is below 128;
  everything else is removed without a replacement.
  """
  ascii_chars = [ch for ch in tweet if ord(ch) < 128]
  return "".join(ascii_chars)

# ord() accepts a string of length 1 and returns the Unicode code point of that character.
# For example, ord("B") returns 66, which is the Unicode code point of the character "B".
# The first 128 Unicode code points are the same as ASCII.

# Do not use this function if translating from one language to another.

In [None]:
# Sanity check: every Greek letter is dropped; the ASCII text, commas,
# slash and spaces around the letters remain in the recorded output.
sampletweet5 = "Greek letters are so much fun Α α, Β β, Γ γ, Δ δ, Ε ε, Ζ ζ, Η η, Θ θ, Ι ι, Κ κ, Λ λ, Μ μ, Ν ν, Ξ ξ, Ο ο, Π π, Ρ ρ, Σ σ/ς, Τ τ, Υ υ, Φ φ, Χ χ, Ψ ψ, and Ω ω."
remove_non_ascii(sampletweet5)

'Greek letters are so much fun  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  /,  ,  ,  ,  ,  , and  .'

## 5. Change to lower case

In [None]:
def lower(text):
  """Return a lower-cased copy of ``text``."""
  lowered = text.lower()
  return lowered

In [None]:
# Sanity check: upper-case text (hashtag included) comes back lower-cased.
sampletweet6 = "I AM #HUNGRY"
lower(sampletweet6)

'i am #hungry'

## 6. Remove stop words

In [None]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
  """Return NLTK's English stop-word list as a set, loading it only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
  return _ENGLISH_STOPWORDS


def remove_stop_words(tweet):
  """Return ``tweet`` with English stop words removed.

  The tweet is split on whitespace, tokens found in NLTK's English
  stop-word list are dropped, and the remaining tokens are re-joined with
  single spaces (so runs of whitespace are collapsed as a side effect).

  Matching is exact and case-sensitive: a token is removed only when it
  equals a list entry as-is. Lower-case the text first if capitalised stop
  words (e.g. "I") should be dropped too.

  The stop-word list is loaded once and reused; previously it was rebuilt
  on every call, i.e. once per row when used with ``Series.apply``.
  """
  stops = _english_stopwords()
  return " ".join(word for word in tweet.split() if word not in stops)

In [None]:
# Sanity check: the recorded output drops "is", "of", "the" and the
# lower-case "i", while the upper-case "I" and the slang "ur" are kept.
sampletweet7 = "#gameofthrones is one of the best TV shows ever! I love #GOT i love #khaleesi ur awesome"
remove_stop_words(sampletweet7)

'#gameofthrones one best TV shows ever! I love #GOT love #khaleesi ur awesome'

## 7. Remove email addresses using Regex

In [None]:
def remove_email_address(tweet):
  """Replace each e-mail address in ``tweet`` with a single space.

  An address is a run of word characters, ``+``, ``.`` or ``-`` on both
  sides of an ``@`` (e.g. my_email0000@gmail.com).
  """
  return re.sub(r"[\w+\.-]+@[\w+\.-]+", " ", tweet)


In [None]:
# Sanity check: the trailing e-mail address is replaced by a single space.
sampletweet8 = "contact customer service #contact contact_1234@hotmail.com"
remove_email_address(sampletweet8)

'contact customer service #contact  '

## 8. Remove punctuation using RegexpTokenizer



In [None]:
def remove_punct(tweet):
  """Return ``tweet`` reduced to its word tokens, joined by single spaces.

  The text is tokenized with ``RegexpTokenizer(r"\\w+")``, so every run of
  non-word characters (punctuation, whitespace) acts as a separator and is
  not carried into the result.
  """
  word_tokenizer = RegexpTokenizer(r"\w+")
  return " ".join(word_tokenizer.tokenize(tweet))

In [None]:
# Sanity check: punctuation is dropped, and apostrophes split contractions
# into separate tokens ("we're" -> "we re" in the recorded output).
sampletweet9 = "we're goin' to a party. Are u comin'?"
remove_punct(sampletweet9)

'we re goin to a party Are u comin'

## 9. Remove digits & special characters


In [None]:
def remove_digits_splchars(tweet):
  """Replace digits and special characters in ``tweet`` with spaces.

  Kept as-is: ASCII letters, whitespace and the punctuation marks
  . , ! ? / : ; " '  — every other character becomes one space, so the
  length of the text does not change.
  """
  return re.sub(r"[^a-zA-Z.,!?/:;\"\'\s]", " ", tweet)


In [None]:
# Sanity check: the asterisks, the digits and "$" each become a space,
# while "!" is kept.
sampletweet10 = "***number 5 and 7 are awesome and they cost only $3!!!"
remove_digits_splchars(sampletweet10)

'   number   and   are awesome and they cost only   !!!'

## 10. Remove special characters (keep digits)

In [None]:
def remove_splchar(tweet):
  """Replace special characters in ``tweet`` with spaces, keeping digits.

  Kept as-is: ASCII letters, digits, whitespace and the punctuation marks
  . , ! ? / : ; " '  — every other character becomes one space.
  """
  return re.sub(r"[^a-zA-Z0-9.,!?/:;\"\'\s]", " ", tweet)

In [None]:
# Sanity check with the same text as the digit-removing example (the name
# sampletweet10 is reused here): "*" and "$" become spaces, digits are kept.
sampletweet10 = "***number 5 and 7 are awesome and they cost only $3!!!"
remove_splchar(sampletweet10)

'   number 5 and 7 are awesome and they cost only  3!!!'

## Apply pre-processing functions to dataset


In [None]:
#df.head()

In [None]:
# One list of hashtags (without the "#") per tweet, taken from the raw text.
df["hashtags"] = df["text"].apply(hashtag)

In [None]:
#df.head()

In [None]:
# Cleaning steps, applied to the raw text in this order. The order matters:
#  - remove_email_address runs before remove_username; otherwise the "@domain"
#    part of an address is deleted as a mention and the e-mail pattern can no
#    longer match what is left;
#  - lower runs before remove_stop_words because stop-word matching is
#    case-sensitive, so capitalised stop words ("THIS", "If") would survive;
#  - punctuation and digit/special-character stripping stay last.
cleaning_steps = (
  remove_email_address,
  remove_username,
  remove_links,
  remove_non_ascii,
  lower,
  remove_stop_words,
  remove_punct,
  remove_digits_splchars,
)

# Always start from the untouched `text` column so re-running this cell gives
# the same result.
cleaned = df["text"]
for step in cleaning_steps:
  cleaned = cleaned.apply(step)
df["clean_tweet"] = cleaned


In [None]:
df["text"][1000] #testing on one instance

'#Coachella2015 #DiscountTickets #OnDeck #PLUR #Drake #Kaskade 626-600-NESS (6377) "@coachella: For those about to #RT http://t.co/0IfmRrFGG5'

In [None]:
df["clean_tweet"][1000] #testing on one instance

'coachella     discounttickets ondeck plur drake kaskade         ness      for rt'

In [None]:
df.head()

Unnamed: 0,coachella_sentiment,coachella_yn,name,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone,hashtags,clean_tweet
0,positive,yes,kokombil,0,#Coachella2015 tickets selling out in less tha...,"[0.0, 0.0]",1/7/15 15:02,5.52963e+17,,Quito,[Coachella2015],coachella tickets selling less minutes ...
1,positive,yes,MisssTaraaa10,2,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,,1/7/15 15:02,5.52963e+17,united states,,[Coachella2015],rt wait this is absolute fire coachella
2,positive,yes,NMcCracken805,0,#Coachella2015 #VIP passes secured! See you th...,,1/7/15 15:01,5.52963e+17,"Costa Mesa, CA",,"[Coachella2015, VIP]",coachella vip passes secured see bitchesssss
3,positive,yes,wxpnfm,1,PhillyÛªs @warondrugsjams will play #Coachell...,,1/7/15 15:01,5.52963e+17,"Philadelphia, PA and Worldwide",Quito,"[Coachella2015, GovBall2015]",phillys play coachella amp govball wat...
4,positive,yes,Caesears,0,If briana and her mom out to #Coachella2015 i...,,1/7/15 15:00,5.52963e+17,,,[Coachella2015],if briana mom coachella im
