In [1]:
import statsmodels.api as sm
import pandas as pd
import re,string
import nltk
from patsy import dmatrices
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
import warnings
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the forum comments from the CSV export.
# NOTE(review): in the later df.head(20) preview row 9 repeats row 0, and every
# word count further down is a multiple of 699 -- this suggests the CSV holds
# repeated rows. Confirm whether de-duplication is needed before counting.
comments_csv = "edmunds_comments.csv"
df = pd.read_csv(comments_csv, encoding='utf-8')
df.head()

Unnamed: 0,date,username,post,quotes
0,September 7,dino001,If they keep it around in next four-five years...,The lease rate is the factor that stops me col...
1,September 7,circlew,The lease rate is the factor that stops me col...,show previous quotes\n\n\nThe Stinger is too b...
2,September 8,qbrozen,circlew said:\nThe lease rate is the factor th...,
3,September 9,FlightNurse2,roadburner said:\nshow previous quotes\n\n\nTh...,"Again, the local dealer are hopeless at best(W..."
4,September 9,roadburner,"Again, the local dealer are hopeless at best(W...","Thanks, but I am more than aware of the distin..."


In [3]:
#clean punctuation and white spaces
def clean_post(text):
    """Lower-case a post, blank out punctuation and collapse whitespace.

    Every character that is neither a word character nor whitespace is
    replaced by a space, then each run of whitespace (newlines included,
    since ``\\s`` matches them) is collapsed to a single space.

    A non-string value (e.g. NaN for a missing post) yields an empty string
    instead of raising AttributeError on ``.lower()``.
    """
    if not isinstance(text, str):
        return ""
    without_punctuation = re.sub(r"[^\w\s]", " ", text.lower())
    # A separate "\n" -> " " pass is unnecessary: \s+ already covers newlines.
    return re.sub(r"\s+", " ", without_punctuation)


df["post_clean"] = df["post"].map(clean_post)

In [11]:
#removing stop words
stop = set(stopwords.words('english'))


def drop_stop_words(post):
    """Return the tokens of ``post`` that are not English stop words.

    ``post`` is normally the cleaned post string, which is split on
    whitespace. Because this cell overwrites ``post_clean`` with token lists,
    a second run of the cell sees lists instead of strings; those are
    filtered as-is so the cell can be re-run without an AttributeError.
    Any other value (e.g. NaN) yields an empty list.
    """
    if isinstance(post, str):
        tokens = post.split()
    elif isinstance(post, (list, tuple)):
        tokens = post
    else:
        tokens = []
    return [word for word in tokens if word not in stop]


df['post_clean'] = df.post_clean.apply(drop_stop_words)

In [4]:
# Preview the first 20 rows, including the new post_clean column.
df.head(n=20)

Unnamed: 0,date,username,post,quotes,post_clean
0,September 7,dino001,If they keep it around in next four-five years...,The lease rate is the factor that stops me col...,if they keep it around in next four five years...
1,September 7,circlew,The lease rate is the factor that stops me col...,show previous quotes\n\n\nThe Stinger is too b...,the lease rate is the factor that stops me col...
2,September 8,qbrozen,circlew said:\nThe lease rate is the factor th...,,circlew said the lease rate is the factor that...
3,September 9,FlightNurse2,roadburner said:\nshow previous quotes\n\n\nTh...,"Again, the local dealer are hopeless at best(W...",roadburner said show previous quotes the sting...
4,September 9,roadburner,"Again, the local dealer are hopeless at best(W...","Thanks, but I am more than aware of the distin...",again the local dealer are hopeless at best wh...
5,September 9,FlightNurse2,"roadburner said:\nAgain, the local dealer are ...",I am the poster formerly known as benjaminh an...,roadburner said again the local dealer are hop...
6,September 9,roadburner,"Thanks, but I am more than aware of the distin...",show previous quotes\nI sent you a message,thanks but i am more than aware of the distinc...
7,September 9,dino001,"roadburner said:\nThanks, but I am more than a...",,roadburner said thanks but i am more than awar...
8,September 24,benjaminhf,I am the poster formerly known as benjaminh an...,If I do manage to upgrade to a TLX it would ha...,i am the poster formerly known as benjaminh an...
9,September 7,dino001,If they keep it around in next four-five years...,The lease rate is the factor that stops me col...,if they keep it around in next four five years...


### Replacing model with make in the corpus

In [6]:
# Load the curated make/model lookup table.
car_models_csv = "car_models_curated_V2.csv"
car_models = pd.read_csv(car_models_csv)
car_models.head()

Unnamed: 0,make,model1
0,MAZDA,2
1,MAZDA,3
2,MAZDA,5
3,MAZDA,6
4,ARCTIC CAT,50


In [7]:
def normalize_name(value):
    """Lower-case a make/model name and strip punctuation.

    Model names such as 2, 3, 5 or 50 may be parsed as numbers rather than
    strings, and blank cells arrive as NaN; both would make ``.lower()``
    raise. Missing values become an empty string and any other non-string
    value is converted with ``str()`` first.
    """
    text = "" if pd.isna(value) else str(value)
    return re.sub(r"[^\w\s]", "", text.lower())


# Apply the same normalisation as the post text so tokens can be matched.
car_models["make"] = car_models["make"].map(normalize_name)
car_models["model1"] = car_models["model1"].map(normalize_name)

In [8]:
#converting into dictionary
#This will help in identifying make for corresponding model
# Result shape: {model: {column: value, ...}}, so car_dict[model]['make'] is the make.
# A model name listed more than once keeps its last row; this is what the
# transpose-based conversion did implicitly (non-unique columns collapse to the
# last one), now stated explicitly. orient='index' requires a unique index,
# hence the drop_duplicates first.
car_dict = (
    car_models
    .drop_duplicates(subset='model1', keep='last')
    .set_index('model1')
    .to_dict(orient='index')
)
# Spot check with a known model.
model = "beetle"
car_dict[model]['make']

  


In [12]:
#replace model with make
model1 = set(car_models["model1"])


def _swap_models_for_makes(tokens):
    """Return a new token list where each known model name is replaced by its make."""
    swapped = []
    for token in tokens:
        if token in model1:
            swapped.append(car_dict[token]['make'])
        else:
            swapped.append(token)
    return swapped


# NOTE(review): the model table preview lists bare numbers (2, 3, 5, 6, 50) as
# models, so any such number appearing in a post is also rewritten to a make --
# confirm this is intended, since it may inflate some make counts.
df["post_c2"] = df.post_clean.map(_swap_models_for_makes)

In [None]:
#lemmatizing
#TODO: POS tags are needed for accurate lemmatizing
#create a function that maps each word's POS tag to the WordNet POS codes (a, n, r, v) accepted by the WordNet lemmatizer
#wnl = WordNetLemmatizer()
#df['post_lema'] = df.post_clean.apply(lambda x: [wnl.lemmatize(word) for word in x])

### Checking frequency for all values

In [17]:
#getting frequency distribution
# Flatten the per-post token lists into one list of tokens.
# A nested comprehension is linear in the number of tokens, whereas summing a
# Series of lists concatenates repeatedly (quadratic) and returns the int 0,
# not a list, when the Series is empty.
car_posts = [token for tokens in df['post_c2'] for token in tokens]

In [18]:
# Count every token occurrence across all posts.
all_words = nltk.FreqDist(car_posts)
# Build the (word, frequency) table directly from the counter's items, keeping
# the counter's insertion order. Unlike the from_dict/reset_index/rename
# chain, this also yields a valid empty two-column frame when there are no
# tokens instead of failing on the column assignment.
rslt_all = pd.DataFrame(list(all_words.items()), columns=['word', 'frequency'])
rslt_all.head()

Unnamed: 0,word,frequency
0,keep,699
1,around,699
2,next,1398
3,four,1398
4,ford,2097


In [20]:
# Keep only the rows whose word is one of the known makes.
rslt_all.loc[rslt_all['word'].isin(car_models["make"])]

Unnamed: 0,word,frequency
4,ford,2097
13,kia,1398
33,bmw,2796
34,polaris,9087
83,freightliner,2796
94,chevrolet,699
100,cub cadet,699
113,triumph,699
132,hyundai,2796
152,infiniti,699


### Taking unique values (each word counted once per post)

In [21]:
# One entry per distinct word per post (sorted within each post), flattened
# into a single list, so a word's count below is the number of posts that
# mention it. The comprehension avoids the quadratic list concatenation of
# Series.sum() and the no-op "".join(word) on an already-joined string.
cars_unique = [word for tokens in df.post_c2 for word in sorted(set(tokens))]

In [22]:
# Count, for each word, the number of per-post entries in cars_unique.
unique_words = nltk.FreqDist(cars_unique)
# Build the (word, frequency) table directly from the counter's items, keeping
# the counter's insertion order. This also yields a valid empty two-column
# frame when there are no tokens instead of failing on the column assignment.
rslt_unique = pd.DataFrame(list(unique_words.items()), columns=['word', 'frequency'])
rslt_unique.head()

Unnamed: 0,word,frequency
0,appropriate,699
1,around,699
2,behind,699
3,better,699
4,biggest,699


In [23]:
# Keep only the rows whose word is one of the known makes.
rslt_unique.loc[rslt_unique['word'].isin(car_models["make"])]

Unnamed: 0,word,frequency
5,bmw,1398
17,ford,1398
31,kia,1398
44,polaris,3495
75,freightliner,2097
90,chevrolet,699
100,cub cadet,699
122,triumph,699
130,hyundai,2796
143,acura,1398
