In [155]:
pip install --upgrade demoji

Note: you may need to restart the kernel to use updated packages.


In [218]:
import re
import pandas as pd
from flask import request
from flasgger import Swagger, LazyString, LazyJSONEncoder
from flasgger import swag_from
from flask import Flask, jsonify
import sqlite3
import demoji
import emoji
from unidecode import unidecode



In [157]:
# Load the abusive-word list; the rendered frame shows a single ABUSIVE column.
df_abussive = pd.read_csv('asset-challenges/abusive.csv')
# Bare last expression so the notebook renders the frame.
df_abussive

Unnamed: 0,ABUSIVE
0,alay
1,ampas
2,buta
3,keparat
4,anjing
...,...
120,rezim
121,sange
122,serbet
123,sipit


In [158]:
# Load the slang ("alay") dictionary. The file is read with header=None, so the
# two columns are named explicitly: kata_alay (slang spelling) and kata_baku
# (standard spelling). The file is decoded as latin-1.
df_kamusalay = pd.read_csv('asset-challenges/new_kamusalay.csv', header=None, names=['kata_alay', 'kata_baku'],encoding='latin-1')
# Bare last expression so the notebook renders the frame.
df_kamusalay

Unnamed: 0,kata_alay,kata_baku
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali
...,...,...
15162,mendikbud,menteri pendidikan dan kebudayaan
15163,mendag,menteri perdagangan
15164,menaker,menteri tenaga kerja
15165,memetwit,mentwit


In [222]:
# Load the labelled tweet dataset (a Tweet text column plus HS/Abusive label
# columns, as shown in the preview) and display the first two rows.
df_data = pd.read_csv('asset-challenges/data.csv',encoding='utf-8')
df_data.head(2)

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0


Fungsi-fungsi

In [160]:
def replace_kata_alay(text, df_kamusalay):
    """Replace slang ("alay") words in ``text`` with their standard spelling.

    Args:
        text: Input string; it is split on whitespace into words.
        df_kamusalay: DataFrame with a ``kata_alay`` column (word to look for)
            and a ``kata_baku`` column (replacement).

    Returns:
        The words re-joined with single spaces. Every word that exactly
        matches a ``kata_alay`` entry is replaced by the ``kata_baku`` value
        of the first matching row; all other words are kept unchanged.
    """
    # Build the lookup once per call instead of scanning the whole column
    # (twice) for every word. setdefault keeps the first row for duplicated
    # keys, which is the row the previous ``.values[0]`` lookup returned.
    lookup = {}
    for alay, baku in zip(df_kamusalay['kata_alay'].values,
                          df_kamusalay['kata_baku'].values):
        lookup.setdefault(alay, baku)

    return ' '.join(lookup.get(word, word) for word in text.split())

In [161]:
def insert_data(text, cleaned_text, db_path='asset-challenges/input_kalimat.db'):
    """Insert one raw/cleaned text pair into the ``data_kalimat`` table.

    Args:
        text: The original text.
        cleaned_text: The cleaned version of ``text``.
        db_path: SQLite database file to write to. Defaults to the database
            this notebook uses.

    Returns:
        None. Any error is printed instead of raised (best-effort insert).
    """
    # Stays None when connect() itself fails, so the finally block does not
    # raise UnboundLocalError and hide the original error.
    conn_f = None
    try:
        conn_f = sqlite3.connect(db_path)
        cursor_f = conn_f.cursor()
        cursor_f.execute("INSERT INTO data_kalimat (text, cleaned_text) VALUES (?, ?)", (text, cleaned_text))
        conn_f.commit()
    except Exception as e:
        print(f"Error insert: {str(e)}")
    finally:
        if conn_f is not None:
            conn_f.close()

In [254]:
def insert_data_file(text, cleaned_text, hs, abusive, hs_individual, hs_group, hs_religion, hs_race, hs_physical, hs_gender, hs_other, hs_weak, hs_moderate, hs_strong, db_path='asset-challenges/input_file.db'):
    """Insert one tweet row (raw text, cleaned text, labels) into ``data_kalimat_file``.

    Args:
        text: The original tweet text.
        cleaned_text: The cleaned version of ``text``.
        hs, abusive, hs_individual, hs_group, hs_religion, hs_race,
        hs_physical, hs_gender, hs_other, hs_weak, hs_moderate, hs_strong:
            Label values stored in the columns of the same name.
        db_path: SQLite database file to write to. Defaults to the database
            this notebook uses.

    Returns:
        None. Any error is printed instead of raised (best-effort insert).
    """
    # Stays None when connect() itself fails, so the finally block does not
    # raise UnboundLocalError and hide the original error.
    conn_f = None
    try:
        conn_f = sqlite3.connect(db_path)
        cursor_f = conn_f.cursor()

        cursor_f.execute("""
            INSERT INTO data_kalimat_file (
                text, cleaned_text, hs, abusive, hs_individual, 
                hs_group, hs_religion, hs_race, hs_physical, hs_gender, 
                hs_other, hs_weak, hs_moderate, hs_strong
            ) 
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (text, cleaned_text, hs, abusive, hs_individual,
              hs_group, hs_religion, hs_race, hs_physical, hs_gender,
              hs_other, hs_weak, hs_moderate, hs_strong))

        conn_f.commit()
    except Exception as e:
        print(f"Error insert: {str(e)}")
    finally:
        if conn_f is not None:
            conn_f.close()

In [108]:
def remove_em(text):
    """Return ``text`` with every emoji reported by ``demoji.findall`` removed."""
    # findall() returns a mapping whose keys are the emoji found in the text.
    for found in demoji.findall(text):
        text = text.replace(found, '')
    return text


main code

In [255]:
# Flask application; LazyJSONEncoder is set as its JSON encoder because the
# Swagger template below contains LazyString values.
app = Flask(__name__)
app.json_encoder = LazyJSONEncoder

# Swagger template: static API info plus a host that is read from the current
# request when the LazyString is evaluated.
swagger_template = {
    'info': {
        'title': LazyString(lambda: 'API Documentation for Data Processing and Modeling'),
        'version': LazyString(lambda: '1.0.0'),
        'description': LazyString(lambda: 'Dokumentasi API untuk Data Processing dan Modeling'),
    },
    'host': LazyString(lambda: request.host),
}

# Swagger routing: the spec is registered at /api.json and specs_route is /api/.
swagger_config = dict(
    headers=[],
    specs=[
        dict(endpoint='api', route='/api.json'),
    ],
    static_url_path="/flasgger_static",
    swagger_ui=True,
    specs_route="/api/",
)

swagger = Swagger(app, template=swagger_template, config=swagger_config)


# Single-sentence input
@swag_from("C://Users/akung/api/text_processing.yml", methods=['POST'])
@app.route('/text-processing', methods=['POST'])
def text_processing():
    """Clean the ``text`` form field, store the result, and return it as JSON.

    Cleaning steps: replace every character that is not a letter, digit or
    whitespace with a space, collapse repeated spaces, then replace slang
    words via ``replace_kata_alay``. The raw and cleaned text are stored with
    ``insert_data``.

    Returns:
        A JSON response with ``status_code``, ``description`` and ``data``
        (the cleaned text). If the ``text`` field is missing, a 400 response
        is returned instead.
    """
    text = request.form.get('text')
    if text is None:
        # Without this guard re.sub() raises TypeError on None and the client
        # only sees a bare 500.
        error_response = {
            'status_code': 400,
            'description': "Parameter 'text' wajib diisi",
            'data': None,
        }
        return jsonify(error_response), 400

    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    cleaned_text = re.sub(' +', ' ', cleaned_text).strip()
    cleaned_text = replace_kata_alay(cleaned_text, df_kamusalay)

    insert_data(text, cleaned_text)

    json_response = {
        'status_code': 200,
        'description': "Teks yang sudah diproses",
        'data': cleaned_text,
    }

    response_data = jsonify(json_response)
    return response_data


def _clean_tweet(text):
    """Apply the file-upload cleaning steps to one tweet string."""
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    cleaned = replace_kata_alay(cleaned, df_kamusalay)
    # Remove the whole-word tokens USER and RT, then normalise whitespace.
    cleaned = re.sub(r'\bUSER\b', '', cleaned)
    cleaned = re.sub(r'\bRT\b', '', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()


#input file
@swag_from("C://Users/akung/api/text_processing_file.yml", methods=['POST'])
@app.route('/text-processing-file', methods=['POST'])
def text_processing_file():
    """Clean every tweet of an uploaded CSV, store the rows, and return the cleaned texts.

    The upload is read from the ``file`` form field as a UTF-8 CSV. It must
    contain a ``Tweet`` column and the twelve label columns listed below.
    Each row is stored with ``insert_data_file``.

    Returns:
        A JSON response with ``status_code``, ``description`` and ``data``
        (list of cleaned tweets). A 400 response is returned when no file is
        uploaded or a required column is missing.
    """
    # Same order as the label parameters of insert_data_file().
    label_columns = [
        'HS', 'Abusive', 'HS_Individual', 'HS_Group', 'HS_Religion', 'HS_Race',
        'HS_Physical', 'HS_Gender', 'HS_Other', 'HS_Weak', 'HS_Moderate', 'HS_Strong',
    ]

    uploads = request.files.getlist('file')
    if not uploads:
        # Indexing [0] on an empty list would raise IndexError (bare 500).
        return jsonify({
            'status_code': 400,
            'description': "File CSV wajib diunggah pada field 'file'",
            'data': [],
        }), 400

    df_file = pd.read_csv(uploads[0], encoding='utf-8')

    missing = [col for col in ['Tweet'] + label_columns if col not in df_file.columns]
    if missing:
        return jsonify({
            'status_code': 400,
            'description': f"Kolom tidak ditemukan: {', '.join(missing)}",
            'data': [],
        }), 400

    # Empty cells are read as NaN (a float); coerce to str so re.sub() does
    # not raise TypeError on them.
    texts_file = df_file['Tweet'].fillna('').astype(str).to_list()
    # One tuple of label values per row, column order as in label_columns.
    label_rows = zip(*(df_file[col].to_list() for col in label_columns))

    cleaned_text_new = [_clean_tweet(text) for text in texts_file]

    # NOTE(review): insert_data_file() opens and commits a connection per row;
    # a batched insert would be faster for large uploads.
    for text, cleaned, labels in zip(texts_file, cleaned_text_new, label_rows):
        insert_data_file(text, cleaned, *labels)

    json_response = {
        'status_code': 200,
        'description': "Teks yang sudah diproses",
        'data': cleaned_text_new,
    }
    response_data = jsonify(json_response)
    return response_data

# Start the Flask server only when this code runs as the main module.
# app.run() blocks until the server is stopped.
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [30/Nov/2023 19:54:35] "POST /text-processing-file HTTP/1.1" 200 -


# olah-olah

# olah-olah

Membuat tabel untuk input kalimat

In [26]:
# Reset step: remove the sentence table so that it can be recreated.
conn = sqlite3.connect('asset-challenges/input_kalimat.db')
cursor = conn.cursor()
# IF EXISTS keeps this cell runnable on a fresh database, where a plain
# DROP TABLE fails with "no such table".
cursor.execute("DROP TABLE IF EXISTS data_kalimat")
conn.commit()
cursor.close()
conn.close()

<sqlite3.Cursor at 0x21cacc87f80>

In [27]:
# Create the table that insert_data() writes to: an auto-incrementing id plus
# the raw text and its cleaned version.
conn = sqlite3.connect('asset-challenges/input_kalimat.db')
cursor = conn.cursor()
cursor.execute('''CREATE TABLE data_kalimat(id INTEGER PRIMARY KEY AUTOINCREMENT, text TEXT, cleaned_text TEXT)''')
print('Sukses buat table')
cursor.close()
conn.close()

Sukses


In [29]:
# Manual check: insert one fixed sample row into data_kalimat and commit it.
conn = sqlite3.connect('asset-challenges/input_kalimat.db')
cursor = conn.cursor()
# test insert
cursor.execute("INSERT INTO data_kalimat (text, cleaned_text) VALUES (?, ?)", ('test kalimat', 'test kalimat bersih'))
conn.commit()
print('sukses')
cursor.close()
conn.close()

sukses


In [72]:
# Fetch every row of data_kalimat into the module-level list `hasil` and print it.
conn = sqlite3.connect('asset-challenges/input_kalimat.db')
cursor = conn.cursor()
# check the table contents
cursor.execute("SELECT * FROM data_kalimat")
hasil = cursor.fetchall()

print(hasil)
cursor.close()
conn.close()

[(1, 'test kalimat', 'test kalimat bersih'), (2, 'coba', 'bersih'), (3, 'cccc', 'cccc'), (4, 'ini pakcikdahtua 😊😊😒❤️', 'ini pak cik sudah tua'), (5, '🐗🐗🐨🐻 🛒🦺🧥 jam⏲️⌚⌛', 'jam'), (6, ' 📢 ', '')]


Membuat tabel untuk input file

In [248]:
# Reset step: remove the file-upload table so that it can be recreated.
conn = sqlite3.connect('asset-challenges/input_file.db')
cursor = conn.cursor()
# IF EXISTS keeps this cell runnable on a fresh database, where a plain
# DROP TABLE fails with "no such table".
cursor.execute("DROP TABLE IF EXISTS data_kalimat_file")
conn.commit()
cursor.close()
conn.close()

In [249]:
# Create the table that insert_data_file() writes to: an auto-incrementing id,
# the raw and cleaned text, and one INTEGER column per label.
conn = sqlite3.connect('asset-challenges/input_file.db')
cursor = conn.cursor()

cursor.execute('''
    CREATE TABLE data_kalimat_file (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        text TEXT,
        cleaned_text TEXT,
        
        hs INTEGER,
        abusive INTEGER,
        hs_individual INTEGER,
        
        hs_group INTEGER,
        hs_religion INTEGER,
        hs_race INTEGER,
        hs_physical INTEGER,
        hs_gender INTEGER,
        hs_other INTEGER,
        hs_weak INTEGER,
        hs_moderate INTEGER,
        hs_strong INTEGER
    );
''')

print('Sukses')
conn.commit()
cursor.close()
conn.close()

Sukses


In [253]:
# Fetch at most one row of data_kalimat_file into `hasil` and print it.
conn = sqlite3.connect('asset-challenges/input_file.db')
cursor = conn.cursor()
# check the table contents (first row only)
cursor.execute("SELECT * FROM data_kalimat_file LIMIT 1")
hasil = cursor.fetchall()
print(hasil)
cursor.close()
conn.close()

[]


In [204]:
# Wrap the rows currently held in `hasil` (filled by a SELECT cell) in a
# DataFrame; no column names are passed.
df_hasil = pd.DataFrame(hasil)


In [37]:
# Empty the data_kalimat_file table (the table itself is kept).
conn = sqlite3.connect('asset-challenges/input_file.db')
cursor = conn.cursor()
# delete all rows
cursor.execute("DELETE FROM data_kalimat_file")
conn.commit()
cursor.close()
conn.close()

In [13]:
# Smoke test for insert_data_file(). The function requires twelve label values
# after the two texts (a two-argument call raises TypeError), so placeholder
# zeros are passed for every label.
text = "coba 2   😒😒😍😍 P)(&%$#@)"
bersih = "bersih 2"
insert_data_file(text, bersih, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

In [115]:
# Sample tweet as raw UTF-8 bytes (contains emoji and an ellipsis).
text= b'Hello! \xf0\x9f\x93\xa2 End Climate Silence is looking for volunteers! \n\n1-2 hours per week. \xf0\x9f\x95\x91\n\nExperience doing digital research\xe2\x80\xa6 \xf0\x9f\x98\x84\xf0\x9f\x98\x84\xf0\x9f\x98\x84  USER Slga mnr bom \xf0\x9f\x98\x82'
# Decode the bytes to str; the second print needs text_decode to be defined,
# otherwise the cell fails with NameError.
text_decode = text.decode("utf-8")
print('ini teks\n',text)


print('ini decode\n',text_decode)

ini teks
 b'Hello! \xf0\x9f\x93\xa2 End Climate Silence is looking for volunteers! \n\n1-2 hours per week. \xf0\x9f\x95\x91\n\nExperience doing digital research\xe2\x80\xa6 \xf0\x9f\x98\x84\xf0\x9f\x98\x84\xf0\x9f\x98\x84  USER Slga mnr bom \xf0\x9f\x98\x82'
ini decode
 Hello! 📢 End Climate Silence is looking for volunteers! 

1-2 hours per week. 🕑

Experience doing digital research… 😄😄😄  USER Slga mnr bom 😂


In [142]:
# Second sample tweet as raw UTF-8 bytes (ends with two emoji).
data_baru = b'Ari sarua beki mah repeh monyet\xf0\x9f\x98\x86\xf0\x9f\x98\x86'
# str(bytes, encoding) decodes the bytes exactly like bytes.decode(encoding).
text_baru = str(data_baru, "utf-8")
print(text_baru)

Ari sarua beki mah repeh monyet😆😆


In [None]:
def remove_em(text):
    """Return ``text`` with every emoji reported by ``demoji.findall`` removed.

    NOTE(review): this notebook also defines remove_em in an earlier cell with
    the same behaviour; running this cell rebinds the name.
    """
    emojis_found = demoji.findall(text)  # mapping keyed by the emoji found
    for emoji_char in emojis_found:
        text = text.replace(emoji_char, '')
    return text

In [189]:
# Re-read data.csv, this time decoded as latin-1, and keep the Tweet column as
# a plain list. This rebinds the module-level names df_data and texts_file.
# NOTE(review): the path is absolute and machine-specific, and the encoding
# differs from the earlier load cell ('asset-challenges/data.csv', utf-8) —
# confirm which file/encoding is intended.
df_data = pd.read_csv('C:/Users/akung/asset-challenges/archive/data.csv',  encoding='latin-1')
texts_file = df_data['Tweet'].to_list()

In [191]:
def remove_emojis(data):
    """Delete every run of characters matched by the emoji character class.

    Args:
        data: Input string.

    Returns:
        ``data`` with all matched characters removed (nothing is inserted in
        their place).

    NOTE(review): the ranges U+24C2..U+1F251 and U+10000..U+10FFFF are very
    wide, so characters inside them that are not emoji are removed as well.
    """
    ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (regional indicators)
        "\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",  # very wide range, see docstring note
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # all supplementary planes, see docstring note
        "\U0001FA00-\U0001FA6F",  # chess symbols
        "\U0001F700-\U0001F77F",  # alchemical symbols
        "\U0001F780-\U0001F7FF",  # geometric shapes extended
        "\U0001F800-\U0001F8FF",  # supplemental arrows-C
        "\U0001F900-\U0001F9FF",  # supplemental symbols and pictographs
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    # Same character class as before, assembled from the pieces above.
    emoji_class = re.compile("[" + "".join(ranges) + "]+", re.UNICODE)
    return emoji_class.sub('', data)

In [None]:
# Re-encode step: read data.csv decoded as latin-1, then write it without the
# index as UTF-8 to 'data.csv' in the current working directory. This rebinds
# the module-level name df_data.
# NOTE(review): if the source file is already UTF-8, decoding it as latin-1
# will garble non-ASCII characters before they are written back — confirm the
# source encoding.
df_data = pd.read_csv('asset-challenges/data.csv',encoding='latin-1')
df_data.to_csv('data.csv', index=False, encoding='utf-8')