# Load Sample text data into dataframe

In [166]:
from sklearn.datasets import fetch_20newsgroups
import StringIO
import pandas as pd

newsgroups_train = fetch_20newsgroups(subset='train')

def parseDocument(data):
    buf = StringIO.StringIO(data)
    line=buf.readline()
    data=[]
    subject=''
    while line:
        if(line.startswith('Subject:')):
            subject = line[8:].strip()
        elif (line.startswith('Lines:')):
               lines = line[6:]
               while line :
                    line = buf.readline()
                    data.append(line)
        line=buf.readline()
    text = ''.join(data)
    
    return subject,text


textlist = []
df = pd.DataFrame(columns=['text'])
for data in newsgroups_train.data[0:1000]:
    subject,text = parseDocument(data)
    df.loc[subject]=text
df.head()

Unnamed: 0,text
WHAT car is this!?,\n I was wondering if anyone out there could e...
SI Clock Poll - Final Call,NNTP-Posting-Host: carson.u.washington.edu\n\n...
PB questions...,"\nwell folks, my mac plus finally gave up the ..."
Re: Weitek P9000 ?,Distribution: world\nNNTP-Posting-Host: amber....
Re: Shuttle Launch Question,\nIn article <15APR199320340428@stdvax> abdkw@...


## Vectorize text with TfIDF

In [167]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['text'].tolist())

print(vectors.shape)

(863, 29183)


# Extract feature by using NMF
<p>
it will use 20 features 

In [158]:
from sklearn.decomposition import NMF

vector_array = vectors.toarray()
nmf = NMF(n_components=40)
nmf.fit(vector_array)
features = nmf.transform(vector_array)

In [159]:
print(features[0:2])


[[ 0.          0.          0.          0.          0.          0.0286214
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.00065824  0.          0.          0.
   0.          0.          0.00099252  0.          0.          0.
   0.56553956  0.00296642  0.          0.          0.          0.          0.        ]
 [ 0.01393805  0.00437501  0.          0.00150891  0.01088364  0.0386125
   0.03420401  0.02165273  0.          0.          0.0037285   0.
   0.00311783  0.          0.          0.          0.          0.
   0.0010077   0.          0.          0.0078043   0.          0.          0.
   0.          0.00323219  0.          0.02547858  0.          0.          0.
   0.          0.00177706  0.          0.          0.          0.
   0.00807561  0.00575439]]


## Normalize extracted feature set

In [160]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
norm_features=normalizer.fit_transform(features)

print(norm_features[0:2])

[[ 0.          0.          0.          0.          0.          0.05054352
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.0011624   0.          0.          0.
   0.          0.          0.00175272  0.          0.          0.
   0.99870591  0.00523851  0.          0.          0.          0.          0.        ]
 [ 0.21229524  0.06663724  0.          0.02298281  0.16577248  0.58812045
   0.52097317  0.32980027  0.          0.          0.0567901   0.
   0.04748882  0.          0.          0.          0.          0.
   0.01534868  0.          0.          0.11887007  0.          0.          0.
   0.          0.04923065  0.          0.38807305  0.          0.          0.
   0.          0.02706698  0.          0.          0.          0.
   0.12300241  0.08764713]]


In [161]:
df_features = pd.DataFrame(norm_features,index=df.index.tolist())
df_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
WHAT car is this!?,0.0,0.0,0.0,0.0,0.0,0.050544,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.998706,0.005239,0.0,0.0,0.0,0.0,0.0
SI Clock Poll - Final Call,0.212295,0.066637,0.0,0.022983,0.165772,0.58812,0.520973,0.3298,0.0,0.0,...,0.0,0.0,0.0,0.027067,0.0,0.0,0.0,0.0,0.123002,0.087647
PB questions...,0.646528,0.077892,0.0,0.088898,0.0,0.093041,0.016277,0.041344,0.07173,0.308199,...,0.062342,0.009374,0.063816,0.119892,0.12528,0.0,0.576505,0.0,0.052305,0.143862
Re: Weitek P9000 ?,0.126321,0.054067,0.00957,0.0,0.0,0.072123,0.688106,0.0,0.0,0.087279,...,0.0,0.0,0.048083,0.0,0.0,0.001528,0.203246,0.031545,0.0,0.0
Re: Shuttle Launch Question,0.0,0.002501,0.181756,0.083245,0.0,0.0,0.0,0.0,0.328827,0.0,...,0.227453,0.000895,0.15854,0.01659,0.0,0.154087,0.453911,0.595473,0.0,0.306844


# find similarities

In [162]:
article = df_features.loc['WHAT car is this!?']
print article


0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.050544
6     0.000000
7     0.000000
8     0.000000
9     0.000000
10    0.000000
11    0.000000
12    0.000000
13    0.000000
14    0.000000
15    0.000000
16    0.000000
17    0.000000
18    0.000000
19    0.000000
20    0.000000
21    0.000000
22    0.000000
23    0.001162
24    0.000000
25    0.000000
26    0.000000
27    0.000000
28    0.000000
29    0.001753
30    0.000000
31    0.000000
32    0.000000
33    0.998706
34    0.005239
35    0.000000
36    0.000000
37    0.000000
38    0.000000
39    0.000000
Name: WHAT car is this!?, dtype: float64


In [164]:
similarities=df_features.dot(article)
top=similarities.nlargest()

texts = df.loc[top.index]['text'].tolist()
i = 0
for text in texts:
    print('TITLE :'+top.index[i]+" Similarities:"+ str(top[i]))
    #print(text+'\n')
    i = i+1

TITLE :WHAT car is this!? Similarities:1.0
TITLE :Re: WHAT car is this!? Similarities:0.999080385281
TITLE :Re: New break pads & exhausts after 96K km (60K mi) on '90 Maxima? Similarities:0.980421814633
TITLE :Insurance Rates on Performance Cars SUMMARY Similarities:0.945184088039
TITLE :Re: What is " Volvo " ? Similarities:0.935911211878


In [165]:
i = 0
for text in texts:
    print('TITLE :'+top.index[i]+" Similarities:"+ str(top[i]))
    print(text+'\n')
    i = i+1

TITLE :WHAT car is this!? Similarities:1.0

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----






TITLE :Re: WHAT car is this!? Similarities:0.999080385281

In article <1993Apr20.174246.14375@wam.umd.edu> lerxst@wam.umd.edu (where's my  
thing) writes:
> 
>  I was wondering if anyone out there could enlighten me on this car I saw
> the other day. It was a 2-door sports car, looked to be from the late 60s/
> early 70s. It was called a Bricklin. The doors were really small. In  
addition,
> the fro