# Import the required packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

# Create a profile for each language, using "x_train.txt" and "y_train.txt"

In [2]:
# number of most frequent n-grams
num_frequent_ngrams = 300 

# List of languages
langs = ['English', 'Spanish', 'French', 'Portuguese', 'German', 'Dutch', 'Assamese', 
            'Thai', 'Turkish', 'Arabic', 'Kurdish', 'South Azerbaijani', 'Bulgarian', 
            'Persian', 'Modern Greek', 'Finnish', 'Hindi', 'Armenian', 'Italian', 'Japanese']

In [3]:
labels_file = 'wili-2018/labels.csv'
x_train_file = 'wili-2018/x_train.txt'
y_train_file = 'wili-2018/y_train.txt'
x_test_file = 'wili-2018/x_test.txt'
y_test_file = 'wili-2018/y_test.txt'

In [4]:
# Read labels.csv to a dataframe
labels_df = pd.read_csv(labels_file, sep=';')
labels_df.head()

Unnamed: 0,Label,English,Wiki Code,ISO 369-3,German,Language family,Writing system,Remarks,Synonyms
0,ace,Achinese,ace,ace,Achinesisch,Austronesian,,,
1,afr,Afrikaans,af,afr,Afrikaans,Indo-European,,,
2,als,Alemannic German,als,gsw,Alemannisch,Indo-European,,(ursprünglich nur Elsässisch),
3,amh,Amharic,am,amh,Amharisch,Afro-Asiatic,,,
4,ang,Old English,ang,ang,Altenglisch,Indo-European,,(ca. 450-1100),Angelsächsisch


Find language labels. These labels are used in "y_train.txt" and "y_test.txt"

In [5]:
lang_labels = list(labels_df[labels_df['English'].isin(langs)]['Label'])
lang_labels

['ara',
 'asm',
 'azb',
 'bul',
 'deu',
 'ell',
 'eng',
 'fas',
 'fin',
 'fra',
 'hin',
 'hye',
 'ita',
 'jpn',
 'kur',
 'nld',
 'por',
 'spa',
 'tha',
 'tur']

In [6]:
lang_names = (list(labels_df[labels_df['English'].isin(langs)]['English']))
lang_names

['Arabic',
 'Assamese',
 'South Azerbaijani',
 'Bulgarian',
 'German',
 'Modern Greek',
 'English',
 'Persian',
 'Finnish',
 'French',
 'Hindi',
 'Armenian',
 'Italian',
 'Japanese',
 'Kurdish',
 'Dutch',
 'Portuguese',
 'Spanish',
 'Thai',
 'Turkish']

Read language paragraphs and labels into dataframes.

In [7]:
def read_file(x_file, y_file):
    """
    Read 'wili-2018' files ('x_train.txt' & 'y_train.txt' or 'x_test.txt' & 'y_test.txt')
    into two dataframes and eliminate all languages that are not listed in lang_labels.

    @param x_file: 'x_train.txt' or 'x_test.txt'
    @type x_file: string

    @param y_file: 'y_train.txt' or 'y_test.txt'
    @type y_file: string

    @return: a tuple containing x_dataframe and y_dataframe
    @rtype: tuple
    """
    
    # Read contents of 'y_file' into a dataframe
    y_df = pd.read_csv(y_file, header=None)
    # y_df has only one column; name it 'Label'
    y_df.columns = ['Label']

    # Read contents of 'x_file' into a list of strings
    with open(x_file, encoding='utf8') as f:
        x_pars = f.readlines()
    
    # Remove all whitespace characters (such as '\n') from the beginning and the end of the strings
    x_pars = [t.strip() for t in x_pars]
    # Convert the list into a dataframe, with one column: 'Par'
    x_df = pd.DataFrame(x_pars, columns=['Par']) 
    # Just keep paragraphs of languages in lang_labels (and remove other languages)
    x_df = x_df[y_df['Label'].isin(lang_labels)]
    # Just keep languages in lang_labels
    y_df = y_df[y_df['Label'].isin(lang_labels)]

    return (x_df, y_df)

In [8]:
x_train_df, y_train_df = read_file(x_train_file, y_train_file)

In [9]:
x_train_df.head()

Unnamed: 0,Par
4,ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...
26,De spons behoort tot het geslacht Haliclona en...
29,エノが行きがかりでバスに乗ってしまい、気分が悪くなった際に助けるが、今すぐバスを降りたいと運...
34,"Mîr, navê yê sereke ku nav kurdan de ya. Lê de..."
38,Tsutinalar (İngilizce: Tsuut'ina): Kanada'da A...


In [10]:
y_train_df.head()

Unnamed: 0,Label
4,tha
26,nld
29,jpn
34,kur
38,tur


Create a list of empty strings. 
Each string of this list will contain all paragraphs of each language in "x_train.txt". 
For instance, lang_pars[0] is a string containing all 'arabic' paragraphs of 'x_train.txt'.

In [11]:
lang_pars = len(lang_labels)*[''] 
for i in range(len(x_train_df)): # traverse rows of "x_train.txt" one by one
    lang_index = lang_labels.index(y_train_df['Label'].iloc[i]) # find index of language that this row belongs to
    lang_pars[lang_index] += ' ' + x_train_df.iloc[i].values[0] # concatinate this row to the string of the corresponding language

In [12]:
lang_pars[0]

' قبل عام بالضبط وبتاريخ 21/7/2012 أعلن البغدادي خطة هدم الأسوار وبتاريخ 21/7/2013 هاجمت داعش سجنين في بغداد هما سجن التاجي وسجن بغداد المركزي. ونجحت في تهريب أكثر من 1000 معتقل . وعندما وصل جنود الحملة الفرنسية غرب مدينة الإسكندرية في 2 يوليو عام 1798 زحفوا على المدينة واحتلوها بعد مقاومة من جانب أهلها وحاكمها محمد كريم دامت ساعات. وبعد ذلك أخذ نابليون يزحف على القاهرة بطريق دمنهور، حيث استطاع الفرنسيون احتلال مدينة رشيد في 6 يوليو ووصلوا إلى الرحمانية وهي قرية على النيل وفي تلك الأثناء، كان المماليك يعدون جيشاً لمقاومة الجيوش الفرنسية بقيادة مراد بك حيث التقى الجيشان بالقرب من شبراخيت في 13 يوليو إلا أن الجيوش المملوكية هُزِمَت واضطرت إلى التقهقر فرجع مراد بك إلى القاهرة. كان بصوته الشجي المميز خطيباً مميزاً وصاحب مدرسة وطور قلّ أن تجد له مثيلا. وقد عرف الشيخ حسن زين الدين في خطابته بالإخلاص في القراءة والالتزام في الطرح الخاص في فن الخطابة الحسينية حيث إلهاب العواطف ومخاطبة العقل في نفس الوقت. فكان يعطي مصائب أهل البيت من وقت القراءة الكثير وكذلك الالتزام بالسيرة العامة لأهل البيت، 

Generate n-gram frequences, for n=1 to 5, using NLTK's CountVectorizer

In [13]:
def create_bag_of_words_df(lang_pars):
    """
    Get a list of paragraphs and compute n-gram frequencies, for n=1 to 5.

    @param lang_pars: a list of paragraphs
    @type lang_pars: list of strings

    @return: n-gram frequencies as a dataframe with n-grams as columns and 
    frequencies of each paragraph as row values
    @rtype: pandas.DataFrame
    """

    # Extract n-gram frequencies, for n=1 to 5, using character n-grams 
    # only from text inside word boundaries; n-grams at the edges of words are padded with space.
    vect = CountVectorizer(ngram_range=(1,5), analyzer='char_wb')
    # Get ngram-paragraph matrix
    bag_of_words = vect.fit_transform(lang_pars)
    # Convert the matrix into datarame for easy processing
    bag_of_words_df = pd.DataFrame(bag_of_words.toarray(), columns=vect.get_feature_names())
    return bag_of_words_df

In [14]:
bag_of_words_df = create_bag_of_words_df(lang_pars)

In [15]:
bag_of_words_df.head()

Unnamed: 0,Unnamed: 1,!,!.1,!),!).,!)..1,"!,","!,.1",!.,!..1,...,～伝」「,～伝」「～,～紅,～紅白,～紅白歌,～紅白歌合,～行,～行状,～行状」,～行状」と
0,71080,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,58576,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,45286,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,57622,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,61144,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
bag_of_words_df.shape

(20, 1294768)

Create language profiles

In [17]:
def most_frequent_ngrams(s):
    """
    Get a pandas Series containing n-gram frequencies and return 'num_frequent_ngrams' most frequent ngrams

    @param s: n-gram frequencies
    @type s: pandas.Series
        
    @return: List of most frequent n-grams
    @rtype: pandas.core.indexes.base.Index
    """
    return s.sort_values(ascending=False).index[:num_frequent_ngrams]

In [18]:
lang_profiles = [] # A list containing language profiles

In [19]:
for i in range(len(lang_labels)):
    lang_profiles.append(most_frequent_ngrams(bag_of_words_df.iloc[i]))

In [20]:
# language profile of Arabic
lang_profiles[0]

Index([' ', 'ا', 'ل', 'ي', 'ال', 'م', 'و', ' ا', 'ن', ' ال',
       ...
       'فا', 'هو', ' الج', ' مع', 'رك', 'دم', ' بال', ' أن ', ' خ', ' بع'],
      dtype='object', length=300)

In [21]:
# language profile of English
lang_profiles[6]

Index([' ', 'e', 'a', 't', 'i', 'o', 'n', 's', 'r', 'h',
       ...
       'ay', 's.', 'mp', 'ive', 'k ', 'ba', 'ith', 'fe', 'cr', 'pl'],
      dtype='object', length=300)

# Predict language of a test paragraph

In [22]:
def lang_predictor(test_profile, lang_profiles):
    """
    Using language profiles learned from train files, predict the language of test_profile.
    The function computes distance of test_profile from each train language profile and
    return language with minimum distance.

    @param test_profile: language profile of a test paragraph
    @type test_profile: list

    @param lang_profiles: list of previously learned language profiles
    @type lang_profiles: list

    @return: label of the predicted language
    """
    # Compute distance of test_profile from each train language profile.
    # Distance criteria: Canvar-Trenkle distance
    distances = []
    for pr in lang_profiles:
        pr_distance = 0
        for n_gram in test_profile:
            if n_gram in pr:
                # Determine how far out of place an n-gram in test_profile is from its place in each of lang_profiles.
                d = list(pr).index(n_gram) - list(test_profile).index(n_gram)
            else:
                # n-gram is not in any of lang_profiles, so it takes maximum out-of-place distance (num_frequent_ngrams=300)
                d = num_frequent_ngrams
            # distance: sum of all of the out-of-place values for all n-grams
            pr_distance += d

        distances.append(pr_distance)
        
    # return label of language with minimum distance 
    return lang_labels[np.argmin(distances)]

In [23]:
test_par = "In 1978 Johnson was awarded an American Institute of Architects Gold Medal. \
                In 1979 he became the first recipient of the Pritzker Architecture Prize \
                the most prestigious international architectural award."

In [24]:
test_bag_of_words_df = create_bag_of_words_df([test_par])

In [25]:
test_profile = most_frequent_ngrams(test_bag_of_words_df.iloc[0])

In [26]:
predicted_lang_label = lang_predictor(test_profile, lang_profiles)

In [27]:
predicted_lang_label

'eng'

In [28]:
if predicted_lang_label in lang_labels:
    # Language name of the corresponding language label
    predicted_lang = lang_names[lang_labels.index(predicted_lang_label)]
else: # Language name not found, return language label
    predicted_lang = predicted_lang_label

In [29]:
predicted_lang

'English'

# Test another language

In [30]:
test_par = "سنی سئویرم"

In [31]:
test_bag_of_words_df = create_bag_of_words_df([test_par])

In [32]:
test_profile = most_frequent_ngrams(test_bag_of_words_df.iloc[0])

In [33]:
predicted_lang_label = lang_predictor(test_profile, lang_profiles)

In [34]:
predicted_lang_label

'azb'

In [35]:
if predicted_lang_label in lang_labels:
    # Language name of the corresponding language label
    predicted_lang = lang_names[lang_labels.index(predicted_lang_label)]
else: # Language name not found, return language label
    predicted_lang = predicted_lang_label

In [36]:
predicted_lang

'South Azerbaijani'

# Measure the accuracy of the method on 'x_test.txt' and 'y_test.txt'

Create profiles for paragraphs of 'x_test.txt' and predict their language label. Then, compare with actual labels listed in 'y_test.txt' and computes the accuracy.

In [37]:
# Read 'x_test.txt' and 'y_test.txt' into dataframes
x_test_df, y_test_df = read_file(x_test_file, y_test_file)

In [38]:
print("'x_test.txt' has %d paragraphs." %(len(x_test_df)))

'x_test.txt' has 10000 paragraphs.


In [39]:
# This list will contain predicted language labels for test paragraphs of 'x_test.txt'
predictions = []

In [40]:
for i in range(len(x_test_df)): 
    # Compute n-gram frequencies for each paragraph listed in x_test_df, which is indicated by x_test_df.iloc[i].values[0]
    x_test_bag_of_words_df = create_bag_of_words_df(x_test_df.iloc[i].values)
    # Compute profile for each paragraph
    par_profile = most_frequent_ngrams(x_test_bag_of_words_df.iloc[0])
    # Predict language label for the test paragraph
    predicted_lang = lang_predictor(par_profile, lang_profiles)
    predictions.append(predicted_lang)
    if i%1000 == 0:
        print("Predicting language for %dth paragraph of 'x_test.txt'" %(i))

Predicting language for 0th paragraph of 'x_test.txt'
Predicting language for 1000th paragraph of 'x_test.txt'
Predicting language for 2000th paragraph of 'x_test.txt'
Predicting language for 3000th paragraph of 'x_test.txt'
Predicting language for 4000th paragraph of 'x_test.txt'
Predicting language for 5000th paragraph of 'x_test.txt'
Predicting language for 6000th paragraph of 'x_test.txt'
Predicting language for 7000th paragraph of 'x_test.txt'
Predicting language for 8000th paragraph of 'x_test.txt'
Predicting language for 9000th paragraph of 'x_test.txt'


Compute accuracy

Note that y_test_df contains actual language labels for x_test_df

In [41]:
acc = accuracy_score(y_test_df, predictions)
print("Accuracy: ", acc)

Accuracy:  0.978
