### Import Library 

In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
import re
import random

### Data 

In [15]:
data = pd.read_csv("data_clean.csv") #memanggil data
data.head() #menampilkan 5 baris teratas data

Unnamed: 0,No,user_name,text,label,lowercase,remove,tweet_clean
0,1,Arnold Mamesah MA,Fobia #COVID19 muncul bersamaan dgn #depresi #...,positif,fobia #covid19 muncul bersamaan dgn #depresi #...,fobia muncul bersamaan dgn dlm berbagai akib,"['fobia', 'muncul', 'bersamaan', 'dgn', 'dlm',..."
1,2,Himmah Online,[Berita]\nProgram vaksinasi masih menjadi kont...,positif,[berita]\nprogram vaksinasi masih menjadi kont...,[berita]program vaksinasi masih menjadi kontro...,"['berit', 'program', 'vaksinasi', 'kontroversi..."
2,3,Inaz Putri,"jangan sampai kecolongan, yuk kita terus menja...",positif,"jangan sampai kecolongan, yuk kita terus menja...",jangan sampai kecolongan yuk kita terus menjag...,"['kecolongan', 'yuk', 'menjaga', 'kesehatan', ..."
3,4,Dicky Syaiful Anwar,Ide sih untuk yg masih menolak di vaksin. Demi...,positif,ide sih untuk yg masih menolak di vaksin. demi...,ide sih untuk yg masih menolak di vaksin demi ...,"['ide', 'sih', 'yg', 'menolak', 'vaksin', 'keb..."
4,5,Karya Untuk Indonesia,"Halo Sobat Kui, vaksinasi bisa berpengaruh unt...",positif,"halo sobat kui, vaksinasi bisa berpengaruh unt...",halo sobat kui vaksinasi bisa berpengaruh untu...,"['halo', 'sobat', 'kui', 'vaksinasi', 'berpeng..."


In [16]:
#Menentukan index data
def comments(index):
    example = data[data.index == index][['tweet_clean', 'user_name', 'label']].values[0]
    if len(example) > 0:
        print(example[0])
        print('User Name:', example[1])
        print('Label:', example[2])
        
comments(1)

['berit', 'program', 'vaksinasi', 'kontroversial', 'khalayak', 'terkait', 'pengaruh', 'efikasi', 'kemanjuran']
User Name: Himmah Online
Label: positif


### Cleaning Data 

In [17]:
clean_spcl = re.compile('[/(){}\[\]\|@,;]')
clean_symbol = re.compile('[^0-9a-z #+_]')
stopworda = set(stopwords.words('indonesian'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = clean_spcl.sub(' ', text)
    text = clean_symbol.sub('', text)
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.encode('ascii', 'replace').decode('ascii')
    text = ' '.join(word for word in text.split() if word not in stopworda) # hapus stopword dari kolom deskripsi   
    return text

# Buat kolom tambahan untuk data description yang telah dibersihkan   
data['text_clean'] = data['tweet_clean'].apply(clean_text)
data

Unnamed: 0,No,user_name,text,label,lowercase,remove,tweet_clean,text_clean
0,1,Arnold Mamesah MA,Fobia #COVID19 muncul bersamaan dgn #depresi #...,positif,fobia #covid19 muncul bersamaan dgn #depresi #...,fobia muncul bersamaan dgn dlm berbagai akib,"['fobia', 'muncul', 'bersamaan', 'dgn', 'dlm',...",fobia muncul bersamaan dgn dlm akib
1,2,Himmah Online,[Berita]\nProgram vaksinasi masih menjadi kont...,positif,[berita]\nprogram vaksinasi masih menjadi kont...,[berita]program vaksinasi masih menjadi kontro...,"['berit', 'program', 'vaksinasi', 'kontroversi...",berit program vaksinasi kontroversial khalayak...
2,3,Inaz Putri,"jangan sampai kecolongan, yuk kita terus menja...",positif,"jangan sampai kecolongan, yuk kita terus menja...",jangan sampai kecolongan yuk kita terus menjag...,"['kecolongan', 'yuk', 'menjaga', 'kesehatan', ...",kecolongan yuk menjaga kesehatan progaram vaks...
3,4,Dicky Syaiful Anwar,Ide sih untuk yg masih menolak di vaksin. Demi...,positif,ide sih untuk yg masih menolak di vaksin. demi...,ide sih untuk yg masih menolak di vaksin demi ...,"['ide', 'sih', 'yg', 'menolak', 'vaksin', 'keb...",ide sih yg menolak vaksin kebaikan besama ayo ...
4,5,Karya Untuk Indonesia,"Halo Sobat Kui, vaksinasi bisa berpengaruh unt...",positif,"halo sobat kui, vaksinasi bisa berpengaruh unt...",halo sobat kui vaksinasi bisa berpengaruh untu...,"['halo', 'sobat', 'kui', 'vaksinasi', 'berpeng...",halo sobat kui vaksinasi berpengaruh pemulihan...
...,...,...,...,...,...,...,...,...
1268,4622,akusimus',Sekarang uni eropa ketat mengontrol vaccine pf...,negatif,sekarang uni eropa ketat mengontrol vaccine pf...,sekarang uni eropa ketat mengontrol vaccine pf...,"['uni', 'eropa', 'ketat', 'mengontrol', 'vacci...",uni eropa ketat mengontrol vaccine pfizer ingg...
1269,4629,prasthink,Coroooonaaa #corona #covid19 #c19 #pusing #bog...,negatif,coroooonaaa #corona #covid19 #c19 #pusing #bog...,coroooonaaancovid,['corooonaaancovid'],corooonaaancovid
1270,4631,Fauzan Rahardian,"wtf, katanya ga bakal berubah? https://t.co/DF...",negatif,"wtf, katanya ga bakal berubah? https://t.co/df...",wtf katanya ga bakal berubah,"['wtf', 'ga', 'berubah']",wtf ga berubah
1271,4632,Discovery â˜¹ðŸ”«,Gaada tanadaÂ² jadi war hammer titan\n#vaksin ...,negatif,gaada tanadaâ² jadi war hammer titan\n#vaksin ...,gaada tanada? jadi war hammer titan,"['gaada', 'tanada', 'war', 'hammer', 'titan']",gaada tanada war hammer titan


In [18]:
data.isna() #melihat missing value

Unnamed: 0,No,user_name,text,label,lowercase,remove,tweet_clean,text_clean
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
1268,False,False,False,False,False,False,False,False
1269,False,False,False,False,False,False,False,False
1270,False,False,False,False,False,False,False,False
1271,False,False,False,False,False,False,False,False


In [19]:
data.notna()

Unnamed: 0,No,user_name,text,label,lowercase,remove,tweet_clean,text_clean
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...
1268,True,True,True,True,True,True,True,True
1269,True,True,True,True,True,True,True,True
1270,True,True,True,True,True,True,True,True
1271,True,True,True,True,True,True,True,True


In [20]:
data.isna().sum() #melihat jumlah misisng value

No             0
user_name      0
text           0
label          0
lowercase      0
remove         1
tweet_clean    0
text_clean     0
dtype: int64

In [21]:
#remove deplicate data dan missing value
data.sort_values("No", inplace = True, axis = 0)
data.dropna(axis = 0)
data.drop_duplicates(subset ="tweet_clean", keep = 'first', inplace = True) #remove duplicate data (tweet ganda)
data.drop(data[data.text_clean == ''].index, inplace=True)
data.drop(data[data.remove == ''].index, inplace=True)
data

Unnamed: 0,No,user_name,text,label,lowercase,remove,tweet_clean,text_clean
0,1,Arnold Mamesah MA,Fobia #COVID19 muncul bersamaan dgn #depresi #...,positif,fobia #covid19 muncul bersamaan dgn #depresi #...,fobia muncul bersamaan dgn dlm berbagai akib,"['fobia', 'muncul', 'bersamaan', 'dgn', 'dlm',...",fobia muncul bersamaan dgn dlm akib
1,2,Himmah Online,[Berita]\nProgram vaksinasi masih menjadi kont...,positif,[berita]\nprogram vaksinasi masih menjadi kont...,[berita]program vaksinasi masih menjadi kontro...,"['berit', 'program', 'vaksinasi', 'kontroversi...",berit program vaksinasi kontroversial khalayak...
2,3,Inaz Putri,"jangan sampai kecolongan, yuk kita terus menja...",positif,"jangan sampai kecolongan, yuk kita terus menja...",jangan sampai kecolongan yuk kita terus menjag...,"['kecolongan', 'yuk', 'menjaga', 'kesehatan', ...",kecolongan yuk menjaga kesehatan progaram vaks...
3,4,Dicky Syaiful Anwar,Ide sih untuk yg masih menolak di vaksin. Demi...,positif,ide sih untuk yg masih menolak di vaksin. demi...,ide sih untuk yg masih menolak di vaksin demi ...,"['ide', 'sih', 'yg', 'menolak', 'vaksin', 'keb...",ide sih yg menolak vaksin kebaikan besama ayo ...
4,5,Karya Untuk Indonesia,"Halo Sobat Kui, vaksinasi bisa berpengaruh unt...",positif,"halo sobat kui, vaksinasi bisa berpengaruh unt...",halo sobat kui vaksinasi bisa berpengaruh untu...,"['halo', 'sobat', 'kui', 'vaksinasi', 'berpeng...",halo sobat kui vaksinasi berpengaruh pemulihan...
...,...,...,...,...,...,...,...,...
1268,4622,akusimus',Sekarang uni eropa ketat mengontrol vaccine pf...,negatif,sekarang uni eropa ketat mengontrol vaccine pf...,sekarang uni eropa ketat mengontrol vaccine pf...,"['uni', 'eropa', 'ketat', 'mengontrol', 'vacci...",uni eropa ketat mengontrol vaccine pfizer ingg...
1269,4629,prasthink,Coroooonaaa #corona #covid19 #c19 #pusing #bog...,negatif,coroooonaaa #corona #covid19 #c19 #pusing #bog...,coroooonaaancovid,['corooonaaancovid'],corooonaaancovid
1270,4631,Fauzan Rahardian,"wtf, katanya ga bakal berubah? https://t.co/DF...",negatif,"wtf, katanya ga bakal berubah? https://t.co/df...",wtf katanya ga bakal berubah,"['wtf', 'ga', 'berubah']",wtf ga berubah
1271,4632,Discovery â˜¹ðŸ”«,Gaada tanadaÂ² jadi war hammer titan\n#vaksin ...,negatif,gaada tanadaâ² jadi war hammer titan\n#vaksin ...,gaada tanada? jadi war hammer titan,"['gaada', 'tanada', 'war', 'hammer', 'titan']",gaada tanada war hammer titan


In [22]:
data.isna().sum() #melihat jumlah misisng value

No             0
user_name      0
text           0
label          0
lowercase      0
remove         0
tweet_clean    0
text_clean     0
dtype: int64

### Convert Data to CSV

In [24]:
data.to_csv('data_clean2.csv', index=False) #convert data yang telah di cleaning ke dalam bentuk file csv
data_clean2= pd.read_csv('data_clean2.csv', encoding = 'latin1')
data_clean2

Unnamed: 0,No,user_name,text,label,lowercase,remove,tweet_clean,text_clean
0,1,Arnold Mamesah MA,Fobia #COVID19 muncul bersamaan dgn #depresi #...,positif,fobia #covid19 muncul bersamaan dgn #depresi #...,fobia muncul bersamaan dgn dlm berbagai akib,"['fobia', 'muncul', 'bersamaan', 'dgn', 'dlm',...",fobia muncul bersamaan dgn dlm akib
1,2,Himmah Online,[Berita]\nProgram vaksinasi masih menjadi kont...,positif,[berita]\nprogram vaksinasi masih menjadi kont...,[berita]program vaksinasi masih menjadi kontro...,"['berit', 'program', 'vaksinasi', 'kontroversi...",berit program vaksinasi kontroversial khalayak...
2,3,Inaz Putri,"jangan sampai kecolongan, yuk kita terus menja...",positif,"jangan sampai kecolongan, yuk kita terus menja...",jangan sampai kecolongan yuk kita terus menjag...,"['kecolongan', 'yuk', 'menjaga', 'kesehatan', ...",kecolongan yuk menjaga kesehatan progaram vaks...
3,4,Dicky Syaiful Anwar,Ide sih untuk yg masih menolak di vaksin. Demi...,positif,ide sih untuk yg masih menolak di vaksin. demi...,ide sih untuk yg masih menolak di vaksin demi ...,"['ide', 'sih', 'yg', 'menolak', 'vaksin', 'keb...",ide sih yg menolak vaksin kebaikan besama ayo ...
4,5,Karya Untuk Indonesia,"Halo Sobat Kui, vaksinasi bisa berpengaruh unt...",positif,"halo sobat kui, vaksinasi bisa berpengaruh unt...",halo sobat kui vaksinasi bisa berpengaruh untu...,"['halo', 'sobat', 'kui', 'vaksinasi', 'berpeng...",halo sobat kui vaksinasi berpengaruh pemulihan...
...,...,...,...,...,...,...,...,...
1247,4622,akusimus',Sekarang uni eropa ketat mengontrol vaccine pf...,negatif,sekarang uni eropa ketat mengontrol vaccine pf...,sekarang uni eropa ketat mengontrol vaccine pf...,"['uni', 'eropa', 'ketat', 'mengontrol', 'vacci...",uni eropa ketat mengontrol vaccine pfizer ingg...
1248,4629,prasthink,Coroooonaaa #corona #covid19 #c19 #pusing #bog...,negatif,coroooonaaa #corona #covid19 #c19 #pusing #bog...,coroooonaaancovid,['corooonaaancovid'],corooonaaancovid
1249,4631,Fauzan Rahardian,"wtf, katanya ga bakal berubah? https://t.co/DF...",negatif,"wtf, katanya ga bakal berubah? https://t.co/df...",wtf katanya ga bakal berubah,"['wtf', 'ga', 'berubah']",wtf ga berubah
1250,4632,Discovery Ã¢ËÂ¹Ã°Å¸âÂ«,Gaada tanadaÃÂ² jadi war hammer titan\n#vaksi...,negatif,gaada tanadaÃ¢Â² jadi war hammer titan\n#vaksi...,gaada tanada? jadi war hammer titan,"['gaada', 'tanada', 'war', 'hammer', 'titan']",gaada tanada war hammer titan


### TFIDF 

In [25]:
data_clean2.head()
data_clean2.set_index('user_name', inplace=True)
tf = TfidfVectorizer(analyzer='word', min_df=0)
tfidf_matrix = tf.fit_transform(data['text_clean'])
#cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
#cos_sim

terms = tf.get_feature_names()
print ("TERMS: \n", terms)

data_clean2 = pd.DataFrame(tfidf_matrix.T.todense(), index=terms) 
#df.sort_values(by=["tfidf"],ascending=False)
data_clean2.head()


TERMS: 
 ['aa', 'aamiin', 'aanu', 'aarbpn', 'aasudahlah', 'abang', 'abangalovera', 'abdul', 'abis', 'abjad', 'absurd', 'acara', 'aceh', 'acehbersamaorang', 'ackerman', 'adadikasih', 'adakah', 'adal', 'adalh', 'adattokoh', 'addi', 'adik', 'adlah', 'adlh', 'admin', 'aduan', 'aebelum', 'afiat', 'afrika', 'agam', 'agama', 'agenda', 'agung', 'agvi', 'ah', 'ahli', 'ahlinya', 'ahmad', 'ahmadsama', 'ahmatd', 'aing', 'aiptu', 'air', 'airlangga', 'aja', 'ajah', 'ajak', 'ajakan', 'aje', 'ajig', 'ak', 'akal', 'akar', 'akhirakhir', 'akhirnyapaket', 'akib', 'akibat', 'akibatkankematiannonton', 'akibatnya', 'akp', 'akselerasi', 'aktivis', 'akun', 'akurasi', 'akurasinya', 'al', 'alam', 'alami', 'alasan', 'alasannya', 'album', 'aleg', 'alerta', 'alhamdulilah', 'alhamdulillah', 'alih', 'allah', 'allaminpak', 'alumni', 'alur', 'alutsista', 'am', 'ama', 'aman', 'amanat', 'amancakaplahberpikir', 'amataslipaling', 'ambil', 'amik', 'amp', 'ampe', 'ampuh', 'ampun', 'amsakar', 'an', 'ana', 'anak', 'anakanak', 



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1242,1243,1244,1245,1246,1247,1248,1249,1250,1251
aa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aamiin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aanu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aarbpn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aasudahlah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
data_clean2.to_csv('data_tfidf.csv', index=False) #convert data hasil tfidf ke dalam bentuk file csv
data_tfidf= pd.read_csv('data_tfidf.csv', encoding = 'latin1')
data_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1242,1243,1244,1245,1246,1247,1248,1249,1250,1251
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.305372,0.0,0.0,0.0,0.0
3732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
