In [1]:
import numpy as np
import pandas as pd
import matplotlib as mlp
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
train_data

Unnamed: 0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.
0,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
1,1,Q-workshop,Q-workshop is a Polish company located in Poz...
2,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
3,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...
4,1,The Unsigned Guide,The Unsigned Guide is an online contacts dire...
5,1,Rest of the world,Within sports and games played at the interna...
6,1,Globoforce,Globoforce is a multinational company co-head...
7,1,Rompetrol,The Rompetrol Group N.V. is a Romanian oil co...
8,1,Wave Accounting,Wave is the brand name for a suite of online ...
9,1,Angstrem (company),Angstrem Group (Russian: ОАО «Ангстрем» named...


In [3]:
train_data.columns = ['a','b','c']
uni = pd.unique(train_data['a'])
train_data.head()

Unnamed: 0,a,b,c
0,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
1,1,Q-workshop,Q-workshop is a Polish company located in Poz...
2,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
3,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...
4,1,The Unsigned Guide,The Unsigned Guide is an online contacts dire...


In [4]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text
train_data['c'] = train_data['c'].apply(cleanText)
train_data

Unnamed: 0,a,b,c
0,1,Schwan-Stabilo,schwanstabilo german maker pens writing colour...
1,1,Q-workshop,qworkshop polish company located pozna special...
2,1,Marvell Software Solutions Israel,marvell software solutions israel known radlan...
3,1,Bergan Mercy Medical Center,bergan mercy medical center hospital located o...
4,1,The Unsigned Guide,unsigned guide online contacts directory caree...
5,1,Rest of the world,within sports games played international compe...
6,1,Globoforce,globoforce multinational company coheadquarter...
7,1,Rompetrol,rompetrol group nv romanian oil company operat...
8,1,Wave Accounting,wave brand name suite online small business so...
9,1,Angstrem (company),angstrem group russian named angstrom group ru...


In [5]:
x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split(train_data["c"],train_data["a"],test_size = 0.5)

In [6]:
x_train

265109    reckart mill also known albright mill historic...
408405    abarema acreana tree species legume family fab...
231392    ps rms douglas ii 45470 second vessel lines hi...
23598     isgec heavy engineering ltd diversified global...
289485                           chapelco river river chile
446200    memories best elaine paige compilation album e...
349136    mashaye 5 gazabad persian 5 also romanized mas...
427348    rhynchopsidium genus flowering plants daisy fa...
33071     gooderham worts canadian company largest disti...
117705    monster maezuka maezuka monsuta born atsushi m...
96729     dorothy whipple ne stirrup 1893 blackburn lanc...
175904    william mordaunt furneaux dean winchester earl...
178984    charles elson roemer iv known chas roemer born...
290352    trus madi range range mountains sabah borneo i...
234549    curtiss v8 motorcycle 269 cu 4410 cc v8 engine...
520375    father hilarys holiday 1965 novel scottish wri...
74064     joseph vaz college wennappuwa 

In [7]:
tfidf = TfidfVectorizer(stop_words= 'english', sublinear_tf= True)
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [8]:
tfidf_fitted = tfidf.fit(x_train)

In [9]:
tfidf_scores = tfidf_fitted.transform(x_train)

In [10]:
tfidf_scores

<279999x491147 sparse matrix of type '<class 'numpy.float64'>'
	with 6732697 stored elements in Compressed Sparse Row format>

In [11]:
x_test

452628    glint kindling folk album released 1979 robin ...
44060     tecumseh high school public high school near n...
426986    nepenthes copelandii npniz koplndia edwin cope...
52254     central conservatory music simplified chinese ...
345037    pir mikail persian also romanized pr mkl also ...
489422    kreutzer sonata 2008 film directed bernard ros...
339097    avclar village district ske aydn province turk...
63348     jnana prabodhini prashala high school located ...
148630    jean mathonet 6 october 1925 bverc malmedy 22 ...
84309     patricia buckley moss also known p buckley mos...
424427    ambrosina genus family araceae consists one sp...
351147    sawowice swavvits village administrative distr...
472734    mitch together standup comedian mitch hedbergs...
426073    buddleja hever pride cultivar hybrid buddleja ...
416974    acrotriche depressa commonly known native curr...
118684    felix stone moscheles 8 february 1833 22 decem...
199028    guillaumeandr fauteux october 

In [12]:
# Deicsion Tree classifier
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(tfidf_scores, y_train)

In [13]:
tfidf_scores_test = tfidf_fitted.transform(x_test)

In [14]:
tfidf_scores_test

<280000x491147 sparse matrix of type '<class 'numpy.float64'>'
	with 6398859 stored elements in Compressed Sparse Row format>

In [15]:
accuracy = clf.score(tfidf_scores_test,y_test)
accuracy

0.9229

In [None]:
# Ordinary Least Squares
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(tfidf_scores, y_train)

In [None]:
reg.score(tfidf_scores_test,y_test)

In [None]:
# Ridge regression and classification
from sklearn import linear_model
reg = linear_model.Ridge(alpha=.5)
reg.fit(tfidf_scores, y_train)

In [None]:
reg.score(tfidf_scores_test,y_test)

In [None]:
# SVM Classifier
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(tfidf_scores, y_train)

In [None]:
clf.score(tfidf_scores_test,y_test)

In [None]:
# SVM Regressor
clf = svm.LinearSVR()
clf.fit(tfidf_scores, y_train)

In [None]:
clf.score(tfidf_scores_test,y_test)