## Importing Required Modules

In [None]:
import lyricsgenius
import pandas as pd
import string
import hazm

## Defining the List of Rappers and Their Corresponding Persian Names

In [None]:
# Artist names to crawl. Each entry is passed verbatim to
# `genius.search_artist` in the crawling step and is also used as a key into
# `artists_english_to_persian_map`, so every name listed here must have an
# entry in that mapping.
artists = [
    'Hichkas',
    'Zedbazi',
    'Sepehr Khalse',
    'Mehrad Hidden',
    'Saman Wilson',
    'Sohrab MJ',
    'Sijal',
    'Alireza JJ',
    'Ali Sorena',
    'Fadaei',
    'Ho3ein',
    'Quf',
    'Gdaal',
    'Erfan',
    'Taham',
    'Behzad Leito',
    'Hiphopologist',
    'Shayea',
    'Ali Owj',
    'Amirali A2',
    'Amir Tataloo',
    'Koorosh',
    'Wantons',
    'Tik Taak',
    'Sina Sa-E',
    'Sina Mafee',
    'Catchybeatz',
    'Putak',
    '021kid',
    'Emad Ghavidel',
    'Bahram',
    'Reza Pishro'
]

# Maps each English artist name from `artists` to the Persian label used for
# that artist in the dataset: the label is the top-level key of `data` during
# crawling and later becomes both the "rapper" column and the prefix placed
# before `<|startoftext|>` in every sample.
# The crawl indexes this dict directly, so a name missing here raises KeyError.
artists_english_to_persian_map = {
    'Hichkas': "هیچکس",
    'Zedbazi': "زدبازی",
    'Sepehr Khalse': "خلسه",
    'Mehrad Hidden': "هیدن",
    'Saman Wilson': "ویلسون",
    'Sohrab MJ': "سهراب",
    'Sijal': "سیجل",
    'Alireza JJ': "جیجی",
    'Ali Sorena': "سورنا",
    'Fadaei': "فدایی",
    'Ho3ein': "حصین",
    'Quf': "قاف",
    'Gdaal': "جیدال",
    'Erfan': "عرفان",
    'Taham': "طهم",
    'Behzad Leito': "لیتو",
    'Hiphopologist': "هیپهاپولوژیست",
    'Shayea': "شایع",
    'Ali Owj': "اوج",
    'Amirali A2': "ایتو",
    'Amir Tataloo': "تتلو",
    'Koorosh': "کوروش",
    'Wantons': "وانتونز",
    'Tik Taak': "تیکتاک",
    'Sina Sa-E': "ساعی",
    'Sina Mafee': "مافی",
    'Catchybeatz': "کچیبیتز",
    'Putak': "پوتک",
    '021kid': "کودک",
    'Emad Ghavidel': "قویدل",
    'Bahram': "بهرام",
    'Reza Pishro': "پیشرو",
}

## Crawling Raps

In [None]:
import os

# NOTE(security): a Genius API token is committed in this notebook. It should
# be revoked/rotated and supplied through the GENIUS_ACCESS_TOKEN environment
# variable instead. The literal is kept only as a fallback so existing runs
# keep working until that happens.
genius = lyricsgenius.Genius(
    os.environ.get(
        "GENIUS_ACCESS_TOKEN",
        "HGNAkuEcjgTQ5qfNXv7gnHrO5efWfMAGlCOqvMW7U4I9p-fKCFdkRt7-KrbdzQ9W",
    )
)

# data: {persian artist label: {song title: raw lyrics text}}
data = {}
for artist in artists:
    artist_persian_name = artists_english_to_persian_map[artist]
    # Every artist gets an entry, even if the search below yields nothing.
    artist_songs = data.setdefault(artist_persian_name, {})

    artist_data = genius.search_artist(artist)
    if artist_data is None:
        # search_artist returns None when no matching artist is found;
        # skip this artist instead of crashing the whole crawl on `.songs`.
        print(f"No Genius results for artist {artist!r}; skipping.")
        continue

    for song in artist_data.songs:
        # Songs sharing a title overwrite each other (last one wins).
        artist_songs[song.title] = song.lyrics

## Defining Functions For Cleaning Up Data

In [None]:
def clean_lines(data):
    """Filter unwanted lines out of every song, in place.

    `data` maps a rapper to a dict of ``{song title: lyrics text}``.
    For each song the text is split on newlines and a line is discarded when
    it contains ``[`` or ``{`` or a lowercase ``c``, or when it is made of at
    most one distinct character (which includes empty lines).

    Songs left with two or fewer lines are deleted from the rapper's dict;
    all other songs are replaced by their surviving lines joined with
    newlines. Nothing is returned.
    """
    # NOTE(review): the '[' / '{' / 'c' markers look like a heuristic for
    # section headers and Latin-script lines - confirm the intent of 'c'.
    rejected_chars = ('[', '{', 'c')

    for songs in data.values():
        too_short = []
        for title, lyrics in songs.items():
            kept = []
            for candidate in lyrics.split('\n'):
                distinct = set(candidate)
                # Keep only lines with 2+ distinct characters and no marker.
                if len(distinct) >= 2 and distinct.isdisjoint(rejected_chars):
                    kept.append(candidate)
            if len(kept) <= 2:
                # Deletion is deferred: the dict cannot shrink mid-iteration.
                too_short.append(title)
            else:
                songs[title] = '\n'.join(kept)
        for title in too_short:
            del songs[title]


def clean_characters(data):
    """Strip or blank out unwanted characters in every song, in place.

    `data` maps a rapper to a dict of ``{song title: lyrics text}``.
    In each text:

    * the listed non-breaking / zero-width / directional characters are
      replaced with a regular space;
    * ASCII digits, ASCII letters and the punctuation ``- , . * # @ ! & " ' ?``
      are removed.

    Newlines and all other characters are left untouched. Nothing is returned.
    """
    chars_to_replace_with_space = [
        '\xa0', '\u200c', '\u205F', '\u2005', '\u200e', '\u200f', '\u202b',
    ]
    chars_to_remove = string.digits + "-,.*#@!&\"'?" + string.ascii_letters

    # Build the mapping once per call. One translate() pass per text replaces
    # the per-line chain of str.replace() calls; the result is identical
    # because the two character sets are disjoint and neither contains '\n'.
    table = str.maketrans(
        ''.join(chars_to_replace_with_space),
        ' ' * len(chars_to_replace_with_space),
        chars_to_remove,
    )

    for raps in data.values():
        for rap in raps:
            raps[rap] = raps[rap].translate(table)

## Cleaning The Data

In [None]:
# Order matters here.
# 1) Drop lines containing '[', '{' or 'c' and lines with at most one distinct
#    character, and delete songs left with two or fewer lines.
clean_lines(data)
# 2) Remove ASCII letters, digits and the listed punctuation; turn the listed
#    special space/direction characters into plain spaces.
clean_characters(data)
# 3) Run the line filter again: character stripping can reduce a line to
#    nothing or to a single repeated character, which may in turn push a song
#    down to two or fewer lines.
clean_lines(data)

## Normalizing Rap Texts & Adding Some of the Special Tokens

In [None]:
# Rows of [rapper label, training sample], one per song.
data_list = []
# NOTE(review): persian_numbers=False presumably stops hazm from converting
# digits to Persian numerals - confirm against the hazm documentation.
normalizer = hazm.Normalizer(persian_numbers=False)

for rapper, raps in data.items():
    for text in raps.values():
        normalized = normalizer.normalize(text)
        # Sample layout: "<rapper label><|startoftext|><lyrics>", with every
        # newline or tab turned into the "<sep>" token.
        sample = f"{rapper}<|startoftext|>{normalized}"
        sample = sample.replace("\n", "<sep>").replace("\t", "<sep>")
        # A single replace pass turns a run of three separators into two, so
        # repeat until no doubled separator is left.
        while "<sep><sep>" in sample:
            sample = sample.replace("<sep><sep>", "<sep>")
        data_list.append([rapper, sample])

## Saving Dataset To Disk as a Pandas Dataframe

In [None]:
# Build the dataset frame: one row per song, with the rapper label and the
# prepared sample text.
df = pd.DataFrame(data_list, columns=["rapper", "rap"])
# to_csv() returns None when given a path, so it must not be chained onto the
# assignment above - otherwise `df` ends up as None instead of the DataFrame.
df.to_csv("data_temp.csv", index=False)