In [9]:
# Importing libraries
import pandas as pd
import numpy as np

In [10]:
# Reading the data into the dataframe
df = pd.read_csv('/Users/aravindnambiar/Desktop/Language Detection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [11]:
# Analysing the dataframe tail
df.tail()

Unnamed: 0,Text,Language
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada
10336,ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...,Kannada


In [12]:
# Analysing the value counts of languages
df['Language'].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

In [13]:
# Analysing the dataframe shape
df.shape

(10337, 2)

In [14]:
# Checking for null values in the dataframe
df.isnull().sum()

Text        0
Language    0
dtype: int64

In [15]:
# Displaying the dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


In [16]:
# Describing the dataframe
df.describe()

Unnamed: 0,Text,Language
count,10337,10337
unique,10267,17
top,Jag är ledsen.,English
freq,3,1385


In [17]:
# Importing libraries fir text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [18]:
# Text tokenisation and stopwords removal
ps= PorterStemmer()
corpus=[]
for i in range(len(df['Text'])):
    rev = re.sub('^[a-zA-Z]','',df['Text'][i])
    rev = rev.lower()
    rev = rev.split()
    stem_word_list=[]
    for word in rev:
        if word not in stopwords.words():
            stem_word = ps.stem(word)
            stem_word = re.sub(r'[^a-zA-Z]',' ',stem_word)
            stem_word_list.append(stem_word)
    corpus.append(' '.join(map(str,stem_word_list)))


In [19]:
# Cleaned text corpus
corpus

['nature  broadest sense  natural  physical  materi world universe ',
 ' nature  refer phenomena physic world  life general ',
 'studi natur large  only  part science ',
 'lthough human part nature  human activ understood separ categori natur phenomena ',
 '    word natur borrow french natur deriv latin word natura   essenti qualities  innat disposition   ancient times  liter meant  birth  ',
 '    ancient philosophy  natura latin translat greek word physi          origin relat intrins characterist plants  animals  featur world develop accord ',
 '       concept natur whole  physic universe  expans origin notion     began core applic word       pre socrat philosoph  though word dynam dimens then  especi heraclitus   steadili gain currenc since ',
 'ure advent modern scientif method centuries  natur passiv reality  organ move divin laws ',
 '       industri revolution  natur increasingli part realiti depriv intent intervent   consid sacr tradit  rousseau  american transcendentalism  dec

In [20]:
# Converting text to vectors (Independent feature)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=200000)
X = cv.fit_transform(corpus)
X.shape

(10337, 24400)

In [21]:
# Converting language labels to vectors (Dependent feature)
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
y=label.fit_transform(df['Language'])
y.shape

(10337,)

In [22]:
# Making a language list
language_list=[]
for language in df['Language']:
    language_list.append(language)

language_list = np.expand_dims(language_list,axis=1)
language_list
    

array([['English'],
       ['English'],
       ['English'],
       ...,
       ['Kannada'],
       ['Kannada'],
       ['Kannada']], dtype='<U10')

In [23]:
# Importing libraries for train test split, machine learning models, evaluation and hyperparameter tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [24]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [None]:
# Initialising the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

# Performing Grid Search Cross-Validation
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)


# Predicting on the test set
y_pred = grid_search.predict(X_test)

In [26]:
# Final model accuracy score
accuracy_score(y_pred,y_test)

0.6373307543520309