-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_func.py
117 lines (91 loc) · 3.92 KB
/
clean_func.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pandas as pd
import numpy as np
import nltk
import string
import re
from exclude_words import stopwords_indonesia, emoticons
# Sastrawi supplies the stemmer used by clean_and_stem_text().
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build the stemmer once at import time; it is shared by every call below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
# Tweet-aware tokenizer from NLTK, used to split cleaned tweets into tokens.
from nltk.tokenize import TweetTokenizer
def load_data(path='data_test.csv'):
    """Load a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file to read. Defaults to
            ``'data_test.csv'`` so existing zero-argument callers keep
            working; pass another path instead of editing this function.

    Returns:
        pandas.DataFrame with the file contents, decoded as ISO-8859-1.
    """
    return pd.read_csv(path, encoding="ISO-8859-1")
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` removed.

    Args:
        input_txt: The string to clean.
        pattern: A regular expression (string or compiled pattern).

    Returns:
        A new string in which each non-overlapping match of ``pattern``
        has been replaced by the empty string.
    """
    # Substitute with the pattern itself. Re-using each matched *text* as a
    # regex (as was done previously) mis-handles matches that contain regex
    # metacharacters and lets a short match such as "@ab" eat the prefix of
    # a longer one such as "@abc".
    return re.sub(pattern, '', input_txt)
# def get_slang_index(slang):
# return list(slang_dict['anakjakartaasikasik']).index(slang)
# def write_list(a_list):
# with open("tweets.json", "w") as fp:
# json.dump(a_list, fp)
# for tweet in alnum_only_tweets:
# for word in tweet:
# if word in list(slang_dict['anakjakartaasikasik']):
# slang_index = get_slang_index(word)
# tweet[tweet.index(word)] = slang_dict['anak jakarta asyik asyik'][slang_index]
def clean_text(tweet):
    """Strip tickers, retweet markers, links, digits and symbols from a tweet.

    The substitutions run in a fixed order; later steps rely on earlier ones
    (for example, links are removed before punctuation is stripped).

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text. Whitespace is left untouched, so removed tokens
        may leave consecutive spaces behind.
    """
    # Remove stock market tickers like $GE.
    tweet = re.sub(r'\$\w*', '', tweet)
    # Remove the old-style retweet marker "RT" at the start of the text.
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Remove hyperlinks. Only the URL token itself is dropped; matching up to
    # the end of the line would also discard any text following the link.
    tweet = re.sub(r'https?://\S+', '', tweet)
    # Keep hashtag words but drop the '#' sign.
    tweet = re.sub(r'#', '', tweet)
    # Remove commas.
    tweet = re.sub(r',', '', tweet)
    # Remove digits.
    tweet = re.sub(r'[0-9]+', '', tweet)
    # Remove every word that starts with 'x'/'X', then any remaining lone
    # 'x'/'X' at a word boundary.
    # NOTE(review): presumably aimed at hex-escape residue such as "xe2x80"
    # left over from byte-string dumps; it also drops real words beginning
    # with x -- confirm this is intended.
    tweet = re.sub(r'\b[xX]\w+', '', tweet)
    tweet = re.sub(r'\b[xX]', '', tweet)
    # Remove everything that is neither a word character nor whitespace.
    tweet = re.sub(r'[^\w\s]', '', tweet)
    return tweet
def clean_and_stem_text(tweet):
    """Clean a tweet, tokenize it, filter unwanted tokens and stem the rest.

    Args:
        tweet: Raw tweet text.

    Returns:
        A single string containing the stemmed tokens that survived
        filtering, joined by single spaces.
    """
    cleaned = clean_text(tweet)

    # Lower-case the text, strip @handles and shorten elongated words.
    tokens = TweetTokenizer(
        preserve_case=False, strip_handles=True, reduce_len=True
    ).tokenize(cleaned)

    def _is_noise(token):
        # Stopwords, emoticons, punctuation and the placeholder 'user'
        # carry no content and are dropped.
        return (
            token in stopwords_indonesia
            or token in emoticons
            or token in string.punctuation
            or token == 'user'
        )

    return ' '.join(
        stemmer.stem(token) for token in tokens if not _is_noise(token)
    )
def clean_csv(path, filename):
    """Clean the first column of a CSV and write the result to ``downloads/``.

    The output keeps the original text column and adds two columns:
    ``remove_user`` (text with @handles removed) and ``remove_symbols``
    (``remove_user`` passed through ``clean_text``). Rows are sorted by
    ``remove_symbols`` and de-duplicated on it, keeping the first occurrence.

    Args:
        path: Location of the input CSV, read as ISO-8859-1.
        filename: Name of the output file created inside ``downloads/``.

    Returns:
        None. The result is written to ``downloads/<filename>`` as UTF-8.
    """
    source = pd.read_csv(str(path), encoding="ISO-8859-1")
    # Only the first column (the tweet text) is processed.
    df = pd.DataFrame(source.iloc[:, 0])

    # Strip usernames (tokens starting with '@'). A per-row apply on the text
    # column also works for an empty frame, which np.vectorize rejects.
    df['remove_user'] = df.iloc[:, 0].apply(
        lambda text: remove_pattern(text, r"@[\w]*")
    )
    # Strip numbers, symbols and links.
    df['remove_symbols'] = df['remove_user'].apply(clean_text)

    # Sort, then drop rows whose cleaned text is identical.
    df = df.sort_values("remove_symbols").drop_duplicates(
        subset="remove_symbols", keep='first'
    )
    df.to_csv(f'downloads/{filename}', encoding='utf8', index=False)
def clean_and_stem_csv(path, filename):
    """Clean and stem the first column of a CSV and write it to ``downloads/``.

    The output contains the original text column plus ``tweet_clean``. Rows
    are sorted by, and de-duplicated on, the intermediate cleaned-and-stemmed
    text before ``tweet_clean`` is computed.

    Args:
        path: Location of the input CSV, read as ISO-8859-1.
        filename: Name of the output file created inside ``downloads/``.

    Returns:
        None. The result is written to ``downloads/<filename>`` as UTF-8.
    """
    source = pd.read_csv(str(path), encoding="ISO-8859-1")
    # Only the first column (the tweet text) is processed.
    df = pd.DataFrame(source.iloc[:, 0])

    # Strip usernames (tokens starting with '@'). A per-row apply on the text
    # column also works for an empty frame, which np.vectorize rejects.
    df['remove_user'] = df.iloc[:, 0].apply(
        lambda text: remove_pattern(text, r"@[\w]*")
    )
    # Clean, tokenize, filter and stem each tweet.
    df['remove_symbols'] = df['remove_user'].apply(clean_and_stem_text)

    # Sort, then drop rows whose processed text is identical.
    df = df.sort_values("remove_symbols").drop_duplicates(
        subset="remove_symbols", keep='first'
    )

    # NOTE(review): this runs the full clean-and-stem pipeline a second time
    # on already processed text. Kept because it can change the output (a
    # stem may be stemmed again or become a stopword) -- confirm whether a
    # single pass was intended.
    df['tweet_clean'] = df['remove_symbols'].apply(clean_and_stem_text)

    # Only the original text and the final cleaned text are exported.
    df = df.drop(columns=['remove_user', 'remove_symbols'])

    # A trailing row-level de-duplication whose result was never assigned has
    # been dropped: it had no effect on the exported data.
    df.to_csv(f'downloads/{filename}', encoding='utf8', index=False)