In [1]:
import numpy as np
import pandas as pd
import matplotlib as mlp
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.
0,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
1,1,Q-workshop,Q-workshop is a Polish company located in Poz...
2,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
3,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...
4,1,The Unsigned Guide,The Unsigned Guide is an online contacts dire...


In [3]:
train_data.columns = ['a','b','c']
uni = pd.unique(train_data['a'])
train_data.head()

Unnamed: 0,a,b,c
0,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
1,1,Q-workshop,Q-workshop is a Polish company located in Poz...
2,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
3,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...
4,1,The Unsigned Guide,The Unsigned Guide is an online contacts dire...


In [4]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text
train_data['c'] = train_data['c'].apply(cleanText)
train_data.head()

Unnamed: 0,a,b,c
0,1,Schwan-Stabilo,schwanstabilo german maker pens writing colour...
1,1,Q-workshop,qworkshop polish company located pozna special...
2,1,Marvell Software Solutions Israel,marvell software solutions israel known radlan...
3,1,Bergan Mercy Medical Center,bergan mercy medical center hospital located o...
4,1,The Unsigned Guide,unsigned guide online contacts directory caree...


In [5]:
x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split(train_data["c"],train_data["a"],test_size = 0.9)

In [7]:
tfidf = TfidfVectorizer(stop_words= 'english', sublinear_tf= True)
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [8]:
tfidf_fitted = tfidf.fit(x_train)

In [9]:
tfidf_scores = tfidf_fitted.transform(x_train)

In [10]:
tfidf_scores

<55999x167507 sparse matrix of type '<class 'numpy.float64'>'
	with 1342863 stored elements in Compressed Sparse Row format>

In [11]:
# Deicsion Tree classifier
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(tfidf_scores, y_train)

In [12]:
tfidf_scores_test = tfidf_fitted.transform(x_test)

In [13]:
tfidf_scores_test

<504000x167507 sparse matrix of type '<class 'numpy.float64'>'
	with 11097136 stored elements in Compressed Sparse Row format>

In [14]:
accuracy = clf.score(tfidf_scores_test,y_test)
accuracy

0.8978214285714285

In [15]:
# Ordinary Least Squares
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(tfidf_scores, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
reg.score(tfidf_scores_test,y_test)

0.72280089538728

In [17]:
# Ridge regression and classification
from sklearn import linear_model
reg = linear_model.Ridge(alpha=.5)
reg.fit(tfidf_scores, y_train)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [18]:
reg.score(tfidf_scores_test,y_test)

0.8203137396313332

In [19]:
# SVM Classifier
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(tfidf_scores, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [20]:
clf.score(tfidf_scores_test,y_test)

0.9734007936507937

In [21]:
# SVM Regressor
clf = svm.LinearSVR()
clf.fit(tfidf_scores, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

In [22]:
clf.score(tfidf_scores_test,y_test)

0.8216705094216661