In [None]:
# Packages for data analysis
import pandas as pd
import numpy as np
import time

# Packages for visualizations
import seaborn as sns
import matplotlib.style as style

# Packages for preprocessing
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
%matplotlib inline
# Apply seaborn's default plot theme to all following figures
sns.set()


# Seed constant for reproducible results
RANDOM_STATE = 42


# Silence every warning for the rest of the session (this also hides deprecation notices)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training and test CSV files from the working directory
train, test = map(pd.read_csv, ("train_set.csv", "test_set.csv"))
# Preview the first ten training rows
train.head(n=10)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...


In [None]:
# Preview the first 10 rows of the test set
test.head(10)

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
5,6,"Ke feela dilense tše hlakilego, tša pono e tee..."
6,7,<fn>(762010101403 AM) 1495 Final Gems Birthing...
7,8,Ntjhafatso ya konteraka ya mosebetsi: Etsa bon...
8,9,u-GEMS uhlinzeka ngezinzuzo zemithi yezifo ezi...
9,10,"So, on occasion, are statistics misused."


In [None]:
# Create independent copies of the original train and test frames
copy_train = train.copy()
copy_test = test.copy()

In [None]:
# Count the training rows per language label
train["lang_id"].value_counts()

xho    3000
eng    3000
nso    3000
ven    3000
tsn    3000
nbl    3000
zul    3000
ssw    3000
tso    3000
sot    3000
afr    3000
Name: lang_id, dtype: int64

In [None]:
# Count how often each value of the test set's "index" column occurs
test["index"].value_counts()

1       1
3818    1
3794    1
3793    1
3792    1
       ..
1893    1
1892    1
1891    1
1890    1
5682    1
Name: index, Length: 5682, dtype: int64

In [None]:
# Tally duplicated training texts (True marks a repeat of an earlier row's text)
train.text.duplicated(keep="first").value_counts()

False    29948
True      3052
Name: text, dtype: int64

In [None]:
# Keep only the first occurrence of every text and renumber the rows from 0
train = train.drop_duplicates(subset=["text"], keep="first", ignore_index=True)
train.describe()

Unnamed: 0,lang_id,text,tex,text_lower,text_cleaned
count,29948,29948,29948,29948,29948
unique,11,29948,29948,29948,29948
top,eng,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...
freq,2998,1,1,1,1


In [None]:
# Model input (raw text) and target (language label)
X, Y = train["text"], train["lang_id"]

In [None]:
# Number of text samples in X
X.shape

(33000,)

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" after each report.
train.info()  # data type and non-null count of each column in the train data
print('\n')
test.info()  # data type and non-null count of each column in the test data


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB
None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5682 entries, 0 to 5681
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   5682 non-null   int64 
 1   text    5682 non-null   object
dtypes: int64(1), object(1)
memory usage: 88.9+ KB
None


In [None]:
# Show (rows, columns) for the training frame, then for the test frame
for frame in (train, test):
    print(frame.shape)

(33000, 2)
(5682, 2)


In [None]:
# Count missing values in each column of the training data
train.isnull().sum()

lang_id    0
text       0
dtype: int64

In [None]:
# Keep only the languages that have more than MIN_SAMPLES_PER_LANG rows
MIN_SAMPLES_PER_LANG = 500
train = (
    train.groupby('lang_id')
    .filter(lambda group: len(group) > MIN_SAMPLES_PER_LANG)
    .reset_index(drop=True)
)

# The old label announced a "number" but printed the label array; report both explicitly
languages = train['lang_id'].unique()
print('Number of languages=>', len(languages))
print('Languages=>', languages)

Number of languages=> ['xho' 'eng' 'nso' 'ven' 'tsn' 'nbl' 'zul' 'ssw' 'tso' 'sot' 'afr']


In [None]:
# Join all texts of each language into one long string per lang_id.
# copy_train is the copy taken right after loading, so it only holds the 'lang_id' and
# 'text' columns; the 'lemmatized' column selected before does not exist on it and
# raised a KeyError. The raw 'text' column is grouped instead.
copy_train_grouped = (
    copy_train[['lang_id', 'text']]
    .groupby(by='lang_id')
    .agg(lambda texts: ' '.join(texts))
)
copy_train_grouped.head()  # viewing grouped data

In [None]:
# List the distinct language labels present in the training data
train['lang_id'].unique()

array(['xho', 'eng', 'nso', 'ven', 'tsn', 'nbl', 'zul', 'ssw', 'tso',
       'sot', 'afr'], dtype=object)

## Preprocessing

In [None]:
# Keep the part of each text before the first ',,,' separator (the whole text when it is absent)
train['tex'] = train['text'].map(lambda raw: raw.partition(',,,')[0])

In [None]:

# Print five raw sample texts (rows 35-39), numbered from 1.
# The loop used to print an empty string instead of the sample, so nothing was shown.
for index, text in enumerate(train['text'][35:40]):
    print('text %d:\n' % (index + 1), text)

text 1:
 
text 2:
 
text 3:
 
text 4:
 
text 5:
 


In [None]:

# Lower-case every text with the vectorised string accessor
train['text_lower'] = train["text"].str.lower()

In [None]:
# Remove every token that contains at least one digit (e.g. "2019", "r500").
# The pattern is now a raw string: in a normal literal '\w' and '\d' are invalid
# escape sequences, which newer Python versions warn about.
DIGIT_TOKEN = re.compile(r'\w*\d\w*')
train['text_cleaned'] = train['text_lower'].apply(lambda x: DIGIT_TOKEN.sub('', x))

In [None]:
# Print five cleaned sample texts (rows 35-39), numbered from 1.
# NOTE(review): the label reads "lang_id" but the number is only a running sample counter.
for position, sample in enumerate(train['text_cleaned'][35:40], start=1):
    print('lang_id %d:\n' % position, sample)

lang_id 1:
 o rekile polase ya gagwe ya lemoenkloof mo sedikeng sa jagersfontein ka ka kadimo go tswa landbank polase ya gagwe e bogolo jwa ha ka masimo a ha le phulo ya ha xolile o dira temothuo ya gagwe ka mmidi dikgomo dikolobe dikoko le merogo
lang_id 2:
 amagama aphakanyisiwe abantu abangaqokwa ngokwesigaba c somthetho kumele athunyelwe kungqongqoshe wesifundazwe noma ilunga lomkhandlu ophethe elibhekele ezempilo kuleso sifundazwe lapho inyanga isebenzela khona kakhulukazi
lang_id 3:
 the department plays a prominent and active role in the world trade organisation particularly by supporting the consolidation of the g group of developing countries to ensure that the interests of developing countries are represented in the industrial tariff negotiations
lang_id 4:
 a ku tsundzuxa huvo eka nhlayo leyi pimiweke ya muhlovo wunwana na wunwana wa mpfumelelo lowu yelanaka na wa tikhasino rheyisisi vugembuli na wejarini leswaku yi fanekele ku nyikiwa eka riphabliki kumbe eka xifundzankulu 

In [None]:
# spaCy was used here without ever being imported, which raised the NameError below
import spacy

# Loading model (parser and NER are switched off; they are not needed for lemmas)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatization with stopwords removal: keep the lemma of every non-stop-word token
test['lemmatized'] = test['text'].apply(
    lambda x: ' '.join(token.lemma_ for token in nlp(x) if not token.is_stop)
)

NameError: ignored

In [None]:


# Lemmatization with stopwords removal.
# The previous expression was not valid Python (unbalanced parentheses in `list((X)`)
# and never ran the text through the pipeline; each cleaned text is now passed to the
# spaCy pipeline `nlp` and the lemmas of its non-stop-word tokens are joined.
train['lemmatized'] = train['text_cleaned'].apply(
    lambda x: ' '.join(token.lemma_ for token in nlp(x) if not token.is_stop)
)

SyntaxError: ignored

In [None]:
# Summary statistics (count / unique / top / freq) for every column of the training frame
train.describe()

Unnamed: 0,lang_id,text,tex,text_lower,text_cleaned
count,33000,33000,33000,33000,33000
unique,11,29948,29948,29948,29948
top,xho,ngokwesekhtjheni yomthetho ophathelene nalokhu...,ngokwesekhtjheni yomthetho ophathelene nalokhu...,ngokwesekhtjheni yomthetho ophathelene nalokhu...,ngokwesekhtjheni yomthetho ophathelene nalokhu...
freq,3000,17,17,17,17


## Exploratory Data Analysis


In [None]:
# Build the list of preprocessed texts: replace square brackets with a space, then lower-case.
# The old pattern r'[[]]' only matched the literal two-character sequence "[]" (and makes
# `re` emit a "possible nested set" FutureWarning); r'[\[\]]' matches either bracket
# wherever it occurs.
BRACKETS = re.compile(r'[\[\]]')
train_list = [BRACKETS.sub(' ', text).lower() for text in X]

## Model Building

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for validation. The split is seeded with RANDOM_STATE so
# that the scores reported below can be reproduced on a re-run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=RANDOM_STATE
)


In [None]:
# Candidate models to compare. Every stochastic estimator takes its seed from the
# notebook-wide RANDOM_STATE constant instead of repeating the literal value.
classifiers = [
    LinearSVC(random_state=RANDOM_STATE),
    LogisticRegression(random_state=RANDOM_STATE,
                       multi_class='ovr',
                       n_jobs=1,
                       C=1e5,
                       max_iter=4000),
    KNeighborsClassifier(n_neighbors=5),
    MultinomialNB(),
    ComplementNB(),
    # tol=None disables early stopping, so exactly max_iter passes over the data are made
    SGDClassifier(loss='hinge',
                  penalty='l2',
                  alpha=1e-3,
                  random_state=RANDOM_STATE,
                  max_iter=5,
                  tol=None),
]

In [None]:
def models_building(classifiers, x_train, y_train, x_test, y_test):
    """Fit each classifier on TF-IDF features and score it on held-out data.

    Every classifier is placed behind a ``TfidfVectorizer`` (unigrams and
    bigrams, terms occurring in more than 90% of the documents dropped) in a
    ``Pipeline``, fitted on the training data and evaluated on the test data.

    Parameters
    ----------
    classifiers : iterable
        Scikit-learn style classifiers. Each instance is fitted as part of its
        pipeline. Results are keyed by class name, so two instances of the
        same class overwrite each other in the summary.
    x_train, x_test : iterable of str
        Raw texts used for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    pandas.DataFrame
        One row per classifier class with the columns ``F1-Macro``,
        ``F1-Accuracy`` (micro-averaged F1), ``F1-Weighted`` and
        ``Execution Time`` (seconds spent on fit + predict).
    """
    summary = {}

    for c in classifiers:
        # TF-IDF vectorisation followed by the classifier under test
        c_text = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                                     max_df=0.9,
                                                     ngram_range=(1, 2))),
                           ('c', c)])

        # Time fitting plus prediction; the start time used to be taken but never read
        start_time = time.time()
        c_text.fit(x_train, y_train)
        pred = c_text.predict(x_test)
        run_time = time.time() - start_time

        # Output for each model
        summary[c.__class__.__name__] = {
            'F1-Macro': metrics.f1_score(y_test, pred, average='macro'),
            'F1-Accuracy': metrics.f1_score(y_test, pred, average='micro'),
            'F1-Weighted': metrics.f1_score(y_test, pred, average='weighted'),
            'Execution Time': run_time,
        }
    return pd.DataFrame.from_dict(summary, orient='index')

In [None]:
# Score every candidate model, then rank the results by macro-averaged F1 (best first)
classifiers_train = models_building(classifiers, x_train, y_train, x_test, y_test)
ordered_train = classifiers_train.sort_values(by='F1-Macro', ascending=False)
ordered_train

ooo
ooo


Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
MultinomialNB,0.99724,0.997329,0.99733
LinearSVC,0.996675,0.996828,0.996828


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words features for the baseline model. They are built in this cell because
# `transfd` was otherwise only created further down the notebook, which made the fit
# below fail with NameError on a top-to-bottom run.
cv = CountVectorizer()
transfd = cv.fit_transform(x_train)

# Multinomial Naive Bayes on the raw term counts
model = MultinomialNB()
model.fit(transfd, y_train)

MultinomialNB()

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# NOTE(review): `y_pred` is only assigned in a later cell (`model.predict(transfd_2)`), so on a
# fresh top-to-bottom run this cell raises NameError — it needs to run after that cell.
ac = accuracy_score(y_test, y_pred)  # fraction of validation samples labelled correctly
cm = confusion_matrix(y_test, y_pred)  # computed here but not displayed in this cell

print("Accuracy is :",ac)

Accuracy is : 0.9986363636363637


In [None]:
# Tune the smoothing parameter of Multinomial Naive Bayes with a 5-fold grid search.
# Two name mismatches made this cell fail with NameError: the grid was stored as
# `paramter_grid` but read as `param_grid`, and the pipeline was stored as `mnb_t`
# but fitted and used as `tuned_mnb`.
param_grid = {'alpha': [0.1, 1, 5, 10]}

tuned_mnb = Pipeline([('tfidf', TfidfVectorizer(min_df=2,
                                                max_df=0.9,
                                                ngram_range=(1, 2))),
                      ('mnb', GridSearchCV(MultinomialNB(),
                                           param_grid=param_grid,
                                           cv=5,
                                           n_jobs=-1,
                                           scoring='f1_weighted'))
                      ])

tuned_mnb.fit(x_train, y_train)  # Fitting the model

y_pred_mnb = tuned_mnb.predict(x_test)  # predicting the fit on validation set

print(classification_report(y_test, y_pred_mnb))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       581
         eng       1.00      1.00      1.00       626
         nbl       1.00      0.99      1.00       582
         nso       1.00      1.00      1.00       606
         sot       1.00      1.00      1.00       584
         ssw       1.00      1.00      1.00       612
         tsn       1.00      1.00      1.00       626
         tso       1.00      1.00      1.00       598
         ven       1.00      1.00      1.00       566
         xho       1.00      1.00      1.00       612
         zul       1.00      1.00      1.00       607

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training texts and encode them as a term-count matrix
cv = CountVectorizer()
transfd = cv.fit_transform(x_train)
transfd.shape

(26400, 125407)

In [None]:
# Encode the validation texts with the fitted vectorizer before predicting:
# `transfd_2` was otherwise only created in the cell below, so this line failed with
# NameError when the notebook was run in order.
transfd_2 = cv.transform(x_test)
y_pred = model.predict(transfd_2)

In [None]:
# Encode the validation texts with the vocabulary already learned by `cv`
transfd_2=cv.transform(x_test)
transfd_2.shape

(6600, 125407)

In [None]:
# Encode the test-set texts with the same fitted vectorizer
a_transfd=cv.transform(test['text'])

In [None]:
# Predict a language label for every encoded test-set row
y_pred_test=model.predict(a_transfd)

In [None]:
# Row identifiers for the submission file, taken from the test set's own "index" column.
# This keeps them aligned with the predictions (which follow the test rows) instead of
# relying on a hard-coded range that must match the file length by hand.
index = test["index"].tolist()

In [None]:
# Start the submission frame with a single "index" column
df = pd.DataFrame({"index": index})

In [None]:
# Attach the predicted language labels as the "lang_id" column
df["lang_id"]=y_pred_test

In [None]:
# Preview the first rows of the submission frame
df.head()

In [None]:
# Write the submission to ay.csv without the DataFrame's own row index
df.to_csv("ay.csv",index=False)