In [176]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
import time
import os
import glob
import itertools
import re
from string import punctuation
#from spellchecker import SpellChecker

# Download & Curate all Datasets

## 1.1 EG Writtings


In [None]:
def scrap_book(URL_rw, URL_en):
    #List of paragraphs to form a 
    paragraphs_rw = []
    paragraphs_en = []

    #Bool variable, if switched to False halts the scrap
    next_ = True
    while next_:
        try:
            #Requests both English and Kinyarwanda Version
            page_rw = requests.get(URL_rw)
            page_en = requests.get(URL_en)

            #Convert to parsable objectt
            content_rw = BeautifulSoup(page_rw.content, "html.parser")
            content_en = BeautifulSoup(page_en.content, "html.parser")

            #find all span content 
            page_rw = content_rw.find_all("span", class_="egw_content")
            page_en = content_en.find_all("span", class_="egw_content")

            #if both Kinyarwanda and English pages don't containt same number of paragraph drop it else append it to list
            if len(page_rw) == len(page_en):
              for item in page_rw:
                paragraphs_rw.append(str(item.get_text()).replace("...", ""))
              for item in page_en:
                paragraphs_en.append(str(item.get_text()).replace("...", ""))

            #find next url in both pages
            next_url_rw = content_rw.find_all("li", class_="next")
            next_url_en = content_en.find_all("li", class_="next")

            #if the next button is disabled then halt the scrapin on this link
            if len(content_rw.find_all("li", class_="next disabled")) > 0:
                next_ = False
            else:
              #find next link for both versions
                URL_rw = 'https://m.egwwritings.org/' + content_rw.find_all("li", class_="next")[0].findChildren()[0].attrs['href']
                URL_en = 'https://m.egwwritings.org/' + content_en.find_all("li", class_="next")[0].findChildren()[0].attrs['href']

        #Accomodate for Connection Error
        except requests.ConnectionError:
            pass
    return pd.DataFrame({'rw':paragraphs_rw, 'en':paragraphs_en})  

with open('data/eg_writtings/sources.json') as file:
    sources = json.loads(file.read())


for index, source in enumerate(sources):
    start = time.time()
    book = scrap_book(source['rw'], source['en'])
    book.to_csv(source['name']+'.csv', encoding='utf-8', index=False)
    print('Duration:', time.time() - start )
    print('Done with', index)

## 1.2 Bible Dataset

In [115]:
# Bible Dataset

with open('data/bible/bible_eng.json', encoding="utf8") as file_en:
    bible_en = json.load(file_en)
    

with open('data/bible/bible_rw.json', encoding="utf8") as file_rw:
    bible_rw = json.load(file_rw)
    
    
def parse_bible(version):
    lines = []
    index = []
    iters = 0
    err = []
    for test_index, testament in enumerate(version['bible']['testament']):
        for book_index , book in enumerate(testament['book']):
            for chapt_index, chapter in enumerate(book['chapter']):
                try:
                    for verse_index, verse in enumerate(chapter['verse']):
                        lines.append({'Testament':test_index, 'Book':book_index, 'Chapter':chapt_index, 'Line':verse_index, 'Verse': verse['#text']})
                        index.append(iters)
                        iters += 1
                except TypeError:
                    err.append(iters)
                    iters += 1
    return lines, index, err
        
en, index_en, e_en = parse_bible(bible_en)
rw, index_rw, e_rw = parse_bible(bible_rw)
    
bible_df_en = pd.DataFrame(en)

bible_df_rw = pd.DataFrame(rw)

bible_merge = bible_df_en.merge(bible_df_rw, how='inner', left_on=["Testament", "Book", "Chapter","Line"], right_on=["Testament", "Book", "Chapter","Line"])

bible = pd.DataFrame({'rw':bible_merge.Verse_y,'en':bible_merge.Verse_x})

## 1.3 Quran Dataset

In [187]:
quran_eng = pd.read_csv('data/Quran/quran_eng.csv')
quran_rw = pd.read_csv('data/Quran/quran_rw.csv')

quran = pd.DataFrame({'rw': quran_rw.translation,'en': quran_eng.translation})

# 2.0 Merging Datasets

## 2.1 Merging All EG Writtings into One Dataframe

In [160]:
#Curate all Ellen G whites Books into one csv
csv_files = glob.glob("data/eg_writtings/*.csv")

eg_corpus = pd.DataFrame({'rw':[], 'en':[]})
for book in csv_files:
    eg_corpus = eg_corpus.append(pd.read_csv(book), ignore_index=True)
    
    
#Check to see if parralel sentence contain same number of punctuation to split them into smaller sentences
new_rw = []
new_en = []
for paragraph in range(len(eg_corpus)):
    if eg_corpus.iloc[paragraph].rw.count('.') == eg_corpus.iloc[paragraph].en.count('.'):
        new_rw.extend(eg_corpus.iloc[paragraph].rw.split('.'))
        new_en.extend(eg_corpus.iloc[paragraph].en.split('.'))
        
    else:
        new_rw.append(eg_corpus.iloc[paragraph].rw)
        new_en.append(eg_corpus.iloc[paragraph].en)
        
eg_corpus = pd.DataFrame({'rw' : new_rw, 'en': new_en})

In [161]:
corpus

Unnamed: 0,rw,en
0,UMUGABANE WA I — KUVA MU MBARAGA UKABA UMUNYAN...,Section 1—From Strength to Weakness
1,“ Uwiteka avuga atya ati: ‘Umunyabwenge ye kwi...,"“Thus saith the Lord, Let not the wise man glo..."
2,IGICE CYA 3 — UBWIBONE BUTERWA NO KUGUBWA NEZA...,Chapter 3—Pride of Prosperity
3,Mu gihe cyose Salomo yubahaga amategeko y’ijur...,"While Solomon exalted the law of heaven, God w..."
4,"Ariko nyuma y’igitondo cy’amasezerano akomeye,...",But after a morning of great promise his life ...
...,...,...
21747,Ineza Kristo yamweretse igihe yamubabariraga ...,The favor which Christ had shown him in forgi...
21748,Yari afite ubuvugarikijyana bukomeye mu itorero,He had much influence in the church
21749,Nyamara icyigisho Kristo yamwigishirije ku nk...,But the lesson which Christ had taught him by...
21750,Igihe yandikiraga amatorero abibwirijwe na Mw...,"Writing by the Holy Spirit to the churches, h..."


## 2.2 Merging Bible and Quran

In [162]:
dataset = bible.append(quran, ignore_index=True)

In [163]:
dataset.head()

Unnamed: 0,rw,en
0,Mbere na mbere Imana yaremye ijuru n'isi.,In the beginning God created the heaven and th...
1,"Isi yari itagira ishusho, yariho ubusa busa, u...","And the earth was without form, and void; and ..."
2,"Imana iravuga iti “Habeho umucyo”, umucyo ubaho.","And God said, Let there be light: and there wa..."
3,"Imana ibona umucyo ko ari mwiza, Imana itanduk...","And God saw the light, that it was good: and G..."
4,"Imana yita umucyo amanywa, umwijima iwita ijor...","And God called the light Day, and the darkness..."


## 2.3 Merging Bible & Quaran with Ellen's Writings

In [164]:
dataset = dataset.append(eg_corpus, ignore_index=True)

In [165]:
dataset

Unnamed: 0,rw,en
0,Mbere na mbere Imana yaremye ijuru n'isi.,In the beginning God created the heaven and th...
1,"Isi yari itagira ishusho, yariho ubusa busa, u...","And the earth was without form, and void; and ..."
2,"Imana iravuga iti “Habeho umucyo”, umucyo ubaho.","And God said, Let there be light: and there wa..."
3,"Imana ibona umucyo ko ari mwiza, Imana itanduk...","And God saw the light, that it was good: and G..."
4,"Imana yita umucyo amanywa, umwijima iwita ijor...","And God called the light Day, and the darkness..."
...,...,...
58856,Ineza Kristo yamweretse igihe yamubabariraga ...,The favor which Christ had shown him in forgi...
58857,Yari afite ubuvugarikijyana bukomeye mu itorero,He had much influence in the church
58858,Nyamara icyigisho Kristo yamwigishirije ku nk...,But the lesson which Christ had taught him by...
58859,Igihe yandikiraga amatorero abibwirijwe na Mw...,"Writing by the Holy Spirit to the churches, h..."


# 3.0 Cleaning Script

# 3.1 Defining Stop Words & Regex Patterns

In [166]:
english_stopwords = {
    "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"}

kinyarwanda_stopwords = {
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'mu','ku','i',
    'na','nka',
    'mb', 'mf', 'mp', 'mv', 'nd', 'ng', 'ny', 'sh', 'nj', 'nk', 'ns', 'ts', 'nsh', 'nshy', 'nt', 'nz', 'bw', 'bg', 'cw', 'dw', 'fw', 'gw', 'hw', 'jw', 'kw', 'mw', 'nw', 'nyw', 'pfw', 'pw', 'rw', 'shw', 'shyw', 'sw', 'tsw', 'tw', 'vw', 'zw', 'by', 'cy', 'jy', 'my', 'nny', 'pfy', 'py', 'ry', 'sy', 'ty', 'vy', 'byw', 'myw', 'pfyw', 'ryw', 'vyw', 'mbw', 'mfw', 'mpw', 'mvw', 'ndw', 'ngw', 'njw', 'nkw', 'nshw', 'nshyw', 'nsw', 'ntw', 'nzw', 'mby', 'mpy', 'mvy', 'ncy', 'ndy', 'njy', 'nsy', 'nty', 'mbyw', 'mvyw', 'njyw',
    'aba', 'abo', 'aha', 'aho', 'ari', 'ati', 'aya', 'ayo', 'ba', 'baba', 'babo', 'bari', 'be', 'bo', 'bose','bw', 'bwa', 'bwo', 'by', 'bya', 'byo', 'cy', 'cya', 'cyo', 'hafi', 'ibi', 'ibyo', 'icyo', 'iki','imwe', 'iri', 'iyi', 'iyo', 'izi', 'izo', 'ka', 'ko', 'ku', 'kuri', 'kuva', 'kwa', 'maze', 'mu', 'muri','na', 'naho','nawe', 'ngo', 'ni', 'niba', 'nk', 'nka', 'no', 'nta', 'nuko', 'rero', 'rw', 'rwa', 'rwo', 'ry','rya','ubu', 'ubwo', 'uko', 'undi', 'uri', 'uwo', 'uyu', 'wa', 'wari', 'we', 'wo', 'ya', 'yabo', 'yari', 'ye','yo', 'yose', 'za', 'zo'}

In [168]:
patterns = {
    "email":'[0-9a-zA-Z\._-]+@[0-9a-zA-Z\._-]+\.[\s]?[a-z]+[\.]?[\s]?[\s]?[edu|com|comgt|uk|ch]*',
    "urls":r'\bhttp[s]?[:,\.\w]*[\s/\\]+[\s/\\]+[\w\s]+\w[:\.]+[\s\w]+[-\w]+[\s\.]?[\.]?[\s]?[\w{3}]*',
    "hashtags": '[#][a-z_0-9]+',
    "phone": '\(\d{3}\)-\d{3}-\d{4}|\d{10}',
    "tags":"<[a-zA-Z]*>[\s]*\w*[\s]*</[a-zA-Z]*>",
    "numbers":"[0-9]+",
    "characters":"[@_!#$%^&*()<>?/\|}{~:;-]",
    "punctuations":r'[^\w\s]',
    "pattern": "pattern"
}

## 3.2 Cleaning Function

In [182]:
# showing full data without truncanations
pd.set_option('display.max_colwidth', None)


class TextCleaning:
    def __init__(self, df):
        self.df = df
    
    def dataset(self):
        return self.df
    
    def pattern(self, pattern_name, show=False, regex=""):
        if(pattern_name in list(patterns.keys())):
            pattern_name = patterns[pattern_name]
            
            if pattern_name == "pattern":
                if regex == '':
                    return 'Please Enter a Pattern'
                pattern_name = regex
                
            for column in self.df:
                temp_column = []
                for sentence in self.df[column]:
                    temp = re.findall(pattern_name, sentence)
                    sentence = re.sub(pattern_name, "", sentence)

                    if show and temp:
                        print(sentence)
                    temp_column.append(sentence)

                self.df[column] = temp_column
            return self.df
        else:
            return "/!\ Choose among these patterns "+list(patterns.keys())
            
    def normalize(self):
        for column in self.df:
            temp = []
            for sentence in self.df[column]:
                temp.append(sentence.lower())
            self.df[column] = temp
        return self.df
    
    def stopwords(self, column):
        if column=='en':
            stopwords = english_stopwords
        elif column=='rw':
            stopwords = kinyarwanda_stopwords
        else:
            return "We don't have the stopwords for this language."
            
        temp_column = []
        for sentence in self.df[column]:
            text = " ".join([word for word in str(sentence).split() if word not in stopwords])
            temp_column.append(text)
        self.df[column] = temp_column
        return self.df
        
    def spell_checker(self, column):
        if column!='en':
            return 'rw'
        temp_column = []
        
        spell = SpellChecker()
        
        def checking(sentence):
            ls = list(sentence.split(" "))
            misspelled = spell.unknown(ls)
            for i in range(len(ls)):
                for incorrect in misspelled:
                    if ls[i]==incorrect:
                        ls[i] = spell.correction(incorrect)
            print(" ".join(str(word) for word in ls))
            return " ".join(str(word) for word in ls)
                
        for sentence in self.df[column]:
            temp_column.append(checking(sentence))
            
        self.df[column] = temp_column
        return self.df
        
    def save(self,file='final_corpus'):
        print('File Saved As '+file)
        self.df.to_csv(file+'.csv')


# 3.3 Cleaning in Action!

In [183]:
obj = TextCleaning(dataset)

obj.pattern('email')
obj.pattern('urls')
obj.pattern('hashtags')
obj.pattern('phone')
obj.pattern('tags')
obj.pattern('numbers')
obj.pattern('punctuations')

obj.normalize()

obj.stopwords(column='rw')
obj.stopwords(column='en')

obj.pattern('characters')

Unnamed: 0,rw,en
0,mbere mbere imana yaremye ijuru nisi,god created heaven earth
1,isi itagira ishusho yariho ubusa busa umwijima hejuru yimuhengeri umwuka wimana yagendagendaga hejuru yamazi,earth form void darkness face deep spirit god moved face waters
2,imana iravuga iti habeho umucyo umucyo ubaho,god light light
3,imana ibona umucyo mwiza imana itandukanya umucyo numwijima,god light good god divided light darkness
4,imana yita umucyo amanywa umwijima iwita ijoro buragoroba buracya umunsi mbere,god called light day darkness called night evening morning day
...,...,...
58856,ineza kristo yamweretse igihe yamubabariraga ubuhakanyi bwe kandi akamushinga kuragira umukumbi ndetse kuba petero yarumviye agakurikira kristo byatumye bagenzi bamugirira icyizere,favor christ forgiving apostasy entrusting feeding flock peteraposs faithfulness christ confidence brethren
58857,afite ubuvugarikijyana bukomeye itorero,influence church
58858,nyamara icyigisho kristo yamwigishirije nkombe yinyanja galileya cyakomeje kumubamo kubaho kwe kose,lesson christ taught sea galilee peter carried life
58859,igihe yandikiraga amatorero abibwirijwe mwuka yaravuze,writing holy spirit churches


In [184]:
obj.df.iloc[110:150]

Unnamed: 0,rw,en
110,iminsi adamu yaramye imyaka magana urwenda mirongo itatu arapfa,days adam lived thirty years died
111,seti yamaze imyaka ijana nitanu avutse abyara enoshi,seth lived years begat enos
112,amaze kubyara enoshi seti arongera amara imyaka magana inani nirindwi ayibyaramo abahungu nabakobwa,seth lived begat enos years begat sons daughters
113,iminsi seti yaramye imyaka magana urwenda cumi nibiri arapfa,days seth years died
114,enoshi yamaze imyaka mirongo urwenda avutse abyara kenani,enos lived years begat cainan
115,amaze kubyara kenani enoshi arongera amara imyaka magana inani cumi nitanu ayibyaramo abahungu nabakobwa,enos lived begat cainan years begat sons daughters
116,iminsi enoshi yaramye imyaka magana urwenda nitanu arapfa,days enos years died
117,kenani yamaze imyaka mirongo irindwi avutse abyara mahalalēli,cainan lived seventy years begat mahalaleel
118,amaze kubyara mahalalēli kenani arongera amara imyaka magana inani mirongo ine ayibyaramo abahungu nabakobwa,cainan lived begat mahalaleel years begat sons daughters
119,iminsi kenani yaramye imyaka magana urwenda cumi arapfa,days cainan years died


# 4.0 Saving the Final Corpus
### We save the final corpus as "final_corpus.csv"

In [185]:
obj.save()

File Saved As final_corpus
