# Text cleaning

1. Lowering
2. Removing hashtags & mentions
3. Removing URLs/links
4. Removing punctuation
5. Removing non-alphanumeric characters
6. Tokenization
7. Stop words removal

In [10]:
# sample tweet
tweet = "Get ready for #NatGeoEarthDay! Join us on 4/21 for an evening of music and celebration, exploration and inspiration https://on.natgeo.com/3t0wzQy."

In [11]:
# Show the raw tweet before any cleaning step is applied.
print(tweet)

Get ready for #NatGeoEarthDay! Join us on 4/21 for an evening of music and celebration, exploration and inspiration https://on.natgeo.com/3t0wzQy.


## 1. Lowering

In [12]:
# Lowercase the tweet: the later non-alphanumeric filter ([^a-z0-9]) and the
# stop-word list only contain lowercase characters.
temp = tweet.lower()
temp

'get ready for #natgeoearthday! join us on 4/21 for an evening of music and celebration, exploration and inspiration https://on.natgeo.com/3t0wzqy.'

## 2. Removing hashtags & mentions

In [13]:
import re

In [14]:
# Strip @mentions (inner call) and then #hashtags (outer call): the marker
# plus the run of letters, digits and underscores that follows it.
temp = re.sub(
    "#[A-Za-z0-9_]+",
    "",
    re.sub("@[A-Za-z0-9_]+", "", temp),
)
temp

'get ready for ! join us on 4/21 for an evening of music and celebration, exploration and inspiration https://on.natgeo.com/3t0wzqy.'

## 3. Removing URLs/links

In [15]:
# Remove links: everything from "http" (which also covers "https") up to the
# next whitespace.
temp = re.sub(r"http\S+", "", temp)
# Same for bare "www." links. The dot is escaped so that only a literal
# "www." prefix matches; an unescaped "." matches any character and would
# also delete ordinary words such as "wwwhat".
temp = re.sub(r"www\.\S+", "", temp)
temp

'get ready for ! join us on 4/21 for an evening of music and celebration, exploration and inspiration '

## 4. Removing punctuation

In [16]:
# Delete parentheses, exclamation marks and question marks.
temp = re.sub(r"[()!?]", "", temp)
# Delete square-bracketed spans together with the text inside them
# (non-greedy, so each [...] pair is removed separately). The pattern is a
# raw string because "\[" is not a valid escape in a normal string literal.
temp = re.sub(r"\[.*?\]", "", temp)
temp

'get ready for  join us on 4/21 for an evening of music and celebration, exploration and inspiration '

## 5. Removing non-alphanumeric characters

In [17]:
# Replace every character that is not a lowercase ASCII letter or a digit
# with a space (a space rather than "" so that e.g. "4/21" becomes "4 21").
temp = re.compile("[^a-z0-9]").sub(" ", temp)
temp

'get ready for  join us on 4 21 for an evening of music and celebration  exploration and inspiration '

## 6. Tokenization

In [18]:
# split() with no argument splits on any run of whitespace and drops empty
# strings, so the repeated spaces left by the earlier steps disappear here.
# Note: from this point on `temp` is a list of tokens, not a string.
temp = temp.split()
temp

['get',
 'ready',
 'for',
 'join',
 'us',
 'on',
 '4',
 '21',
 'for',
 'an',
 'evening',
 'of',
 'music',
 'and',
 'celebration',
 'exploration',
 'and',
 'inspiration']

## 7. Stop words removal

In [30]:
# Small hand-written stop-word list (all lowercase, to match the lowercased
# tokens). clean_tweet() reads this module-level name, so this cell must be
# run before clean_tweet() is called.
stopwords = ["for", "on", "an", "a", "of", "and", "in", "the", "to", "from", "rt"]

In [31]:
# Keep only the tokens that are not in the stop-word list ...
temp = [w for w in temp if w not in stopwords]
# ... and glue them back into a single space-separated string.
temp = " ".join(temp)

In [21]:
# Final result of the step-by-step pipeline: one space-separated string.
temp

'get ready join us 4 21 evening music celebration exploration inspiration'

## Cleaning function

In [22]:
import numpy as np
import re
 
def clean_tweet(tweet, stop_words=None):
    """Normalize a raw tweet into a space-separated string of lowercase tokens.

    Steps, in order: lowercase, drop apostrophes, strip @mentions and
    #hashtags, strip http(s) and www links, replace punctuation and every
    other non-alphanumeric character with a space, split on whitespace, and
    discard stop words.

    Parameters
    ----------
    tweet : str or missing value
        Raw tweet text. Anything that is not a ``str`` (for example a float
        NaN standing in for an empty cell, or None) yields "".
    stop_words : iterable of str, optional
        Lowercase tokens to discard. When omitted, the notebook-level
        ``stopwords`` list is used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if none remain).
    """
    # The previous check, `type(tweet) == np.float`, fails with AttributeError
    # on NumPy >= 1.24 where the np.float alias no longer exists. Testing for
    # str directly needs no NumPy and also covers None.
    if not isinstance(tweet, str):
        return ""
    if stop_words is None:
        stop_words = stopwords  # notebook-level default list
    blocked = set(stop_words)

    temp = tweet.lower()
    # Delete apostrophes first so a contraction stays one token
    # ("don't" -> "dont") instead of being split into "don" and "t" below.
    temp = re.sub(r"'", "", temp)
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)  # @mentions
    temp = re.sub(r"#[A-Za-z0-9_]+", "", temp)  # #hashtags
    temp = re.sub(r"http\S+", "", temp)         # http:// and https:// links
    temp = re.sub(r"www\.\S+", "", temp)        # bare www. links
    temp = re.sub(r"[()!?]", " ", temp)
    temp = re.sub(r"\[.*?\]", " ", temp)        # [bracketed] spans, contents included
    temp = re.sub(r"[^a-z0-9]", " ", temp)      # anything else becomes a separator
    tokens = [w for w in temp.split() if w not in blocked]
    return " ".join(tokens)

In [23]:
# The original tweet is unchanged; the cleaning steps above only reassigned `temp`.
tweet

'Get ready for #NatGeoEarthDay! Join us on 4/21 for an evening of music and celebration, exploration and inspiration https://on.natgeo.com/3t0wzQy.'

In [24]:
# Run the whole pipeline in one call on the sample tweet.
clean_tweet(tweet)

'get ready join us 4 21 evening music celebration exploration inspiration'

## Real dataset

In [25]:
import pandas as pd

In [26]:
# Load the collected tweets, supplying the column labels explicitly.
df = pd.read_csv(
    "result.csv",
    names=["id", "user", "datetime", "text"],
)

In [27]:
# Preview the first rows only instead of dumping the whole frame.
df.head(10)

Unnamed: 0,id,user,datetime,text
0,1451102531766136833,carew_mad,2021-10-21 08:26:18+00:00,RT @PeterSweden7: Canada🇨🇦: They are arresting...
1,1451102532021809154,smileyperson,2021-10-21 08:26:18+00:00,RT @YungYinkv: A young lady and I both walked ...
2,1451102533364031489,PndasPntH0use,2021-10-21 08:26:18+00:00,RT @welovejennalove: 🎉🎈Wow! 30k followers. Tha...
3,1451102533552787457,RTHhan20,2021-10-21 08:26:18+00:00,RT @raufsissay: Yes. She has a valid point. Bu...
4,1451102533804507140,AgencyWebcam,2021-10-21 08:26:18+00:00,RT @welovejennalove: 🎉🎈Wow! 30k followers. Tha...
5,1451102534626594816,savagegamer17,2021-10-21 08:26:19+00:00,RT @XAG_official: #RuralWomen play a vital rol...
6,1451102537700962307,cuxxxom,2021-10-21 08:26:19+00:00,"RT @KhaosodEnglish: Jay Fai, at last, we see e..."
7,1451102538242174979,seitther,2021-10-21 08:26:19+00:00,how the hell did i not get food poisining or f...
8,1451102539588440068,ray_rayalder,2021-10-21 08:26:20+00:00,RT @sammielee46: Still no ambulance for my 86 ...
9,1451102541895307266,Luckylove6469,2021-10-21 08:26:20+00:00,RT @harperjones69: Grab my hips and make me sc...


In [28]:
# Raw tweet text column, before cleaning.
df["text"]

0     RT @PeterSweden7: Canada🇨🇦: They are arresting...
1     RT @YungYinkv: A young lady and I both walked ...
2     RT @welovejennalove: 🎉🎈Wow! 30k followers. Tha...
3     RT @raufsissay: Yes. She has a valid point. Bu...
4     RT @welovejennalove: 🎉🎈Wow! 30k followers. Tha...
5     RT @XAG_official: #RuralWomen play a vital rol...
6     RT @KhaosodEnglish: Jay Fai, at last, we see e...
7     how the hell did i not get food poisining or f...
8     RT @sammielee46: Still no ambulance for my 86 ...
9     RT @harperjones69: Grab my hips and make me sc...
10    RT @misscelebrian: Grab them. https://t.co/8a8...
11    RT @sammielee46: Still no ambulance for my 86 ...
12    That time when I was in BBC Good Food magazine...
13    Enough of condemnation, denunciation, you &amp...
14    RT @ZeroWisdom: Crazy:B order\n\nRinne : don't...
15    RT @Mahi_GA: No food. No medicine.\nNo banks. ...
16    RT @citizen_uganda: 𝗔𝗴𝗿𝗼-𝘀𝗲𝗰𝘂𝗿𝗶𝘁𝘆:This award w...
17    As I recall,I'm not good at making Japanes

In [32]:
# Clean every tweet; returns a new Series and leaves `df` itself untouched.
df["text"].apply(clean_tweet)

0     canada they are arresting pastors crime having...
1     young lady i both walked into establishment wi...
2     wow 30k followers thank you supporting this pa...
3     yes she has valid point but people also need s...
4     wow 30k followers thank you supporting this pa...
5     play vital role providing us with diverse suff...
6     jay fai at last we see each other plain contin...
7     how hell did i not get food poisining or felt ...
8     still no ambulance my 86 year old grandad he s...
9                           grab my hips make me scream
10                                            grab them
11    still no ambulance my 86 year old grandad he s...
12          that time when i was bbc good food magazine
13    enough condemnation denunciation you amp secur...
14    crazy b order rinne dont contact ur brother ni...
15    no food no medicine no banks no electricity no...
16    this award will value innovations agro process...
17    as i recall im not good at making japanese