In [2]:
import pandas as pd

In [3]:
en_data = pd.read_csv("./en.en", delimiter="\n", header=None)
en_data = en_data.rename(columns={0:"sentences"})
en_data["label"] = "en"

In [21]:
fr_data = pd.read_csv("./fr.fr", delimiter="\n", header=None)
fr_data = en_data.rename(columns={0:"sentences"})
fr_data["label"] = "fr"

In [22]:
it_data = pd.read_csv("./it.it", delimiter="\n", header=None)
it_data = it_data.rename(columns={0:"sentences"})
it_data["label"] = "it"

In [23]:
data = pd.concat([en_data, fr_data, it_data])

In [24]:
data.isnull().sum()

sentences    0
label        0
dtype: int64

In [25]:
data['label'].value_counts()

it    89
fr    75
en    75
Name: label, dtype: int64

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X = data['sentences']
y = data['label']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(160, 1257)

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(160, 1257)

In [31]:
# fit our model
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)  

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [33]:
# Form a prediction set
predictions = text_clf.predict(X_test)

In [34]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[ 3 21  0]
 [22  4  0]
 [ 0  0 29]]


In [35]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

          en       0.12      0.12      0.12        24
          fr       0.16      0.15      0.16        26
          it       1.00      1.00      1.00        29

   micro avg       0.46      0.46      0.46        79
   macro avg       0.43      0.43      0.43        79
weighted avg       0.46      0.46      0.46        79



In [36]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.45569620253164556
