### Word Frequency

In [465]:
import string
import pandas as pd
import numpy as np
import nltk
import re
from itertools import combinations, product

nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
translator = str.maketrans('', '', string.punctuation)

[nltk_data] Downloading package stopwords to /Users/matt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1 & 2

We developed a scraper called "scrappy" that lives in the scrappy folder.

In [535]:
df = pd.read_csv('./scrappy/scrappy_do.csv')
df = df[0:5000] # first 5000
df.head()

Unnamed: 0,date,userid,message
0,2002-03-25T05:54:02+00:00,merc1,I personally think that with a few tweaks the ...
1,2002-03-25T07:06:29+00:00,fredvh,I am debating a new purchase and these two are...
2,2002-03-25T17:02:27+00:00,blueguydotcom,"Great handling, RWD, excellent engine and the ..."
3,2002-03-25T23:04:37+00:00,hungrywhale,And no manual tranny. That may not matter to y...
4,2002-03-26T00:44:13+00:00,riez,One beauty of BMW 3 Series is that there are s...


### 3

In [541]:
modelsDF = pd.read_csv('./models.csv', names=['make', 'model'])
# this dataframe contains some other mappings like seat => seats
# for the makes / models will we ignore those, they are at the end
makemodels = modelsDF[0:520]

def wordCount(post):
    d = {}
    l = str(post).lower().translate(translator).split(" ")
    filtered = [w for w in l if w not in stop_words]
    for w in filtered:
        d[w] = 1 if not d.get(w) else d[w] + 1
    return d

allText = ' '.join(list(map(lambda x: str(x), list(df['message']))))
counts = wordCount(allText)

def extractMakes(wordCounts):
    makes = {}
    for make in makemodels['make']:
        if wordCounts.get(make):
            makes[make] = wordCounts[make]
    return makes

def combineModels(makes, wordCounts):
    for row in makemodels.iterrows():
        make = row[1].make
        model = row[1].model
        if wordCounts.get(model):
            makes[make] += wordCounts.get(model)
    return makes
    
carCounts = combineModels(extractMakes(counts), counts)
carCounts = [(k, v) for k, v in carCounts.items()]
carCounts.sort(key=lambda x: x[1], reverse=True)
display(carCounts)

top10 = list(map(lambda c: c[0], carCounts[0:10]))
print('Top 10: {}'.format(top10))

[('bmw', 2430),
 ('acura', 793),
 ('audi', 780),
 ('honda', 637),
 ('toyota', 522),
 ('infiniti', 515),
 ('nissan', 468),
 ('volkswagen', 416),
 ('subaru', 351),
 ('chevrolet', 209),
 ('mercedes', 206),
 ('volvo', 195),
 ('ford', 190),
 ('hyundai', 187),
 ('mazda', 91),
 ('cadillac', 89),
 ('chrysler', 81),
 ('dodge', 65),
 ('pontiac', 65),
 ('lincoln', 58),
 ('buick', 30),
 ('kia', 25),
 ('mitsubishi', 24),
 ('suzuki', 18),
 ('mercury', 16),
 ('saturn', 12)]

Top 10: ['bmw', 'acura', 'audi', 'honda', 'toyota', 'infiniti', 'nissan', 'volkswagen', 'subaru', 'chevrolet']


### Task A.

In [582]:
def splitReplace(post):
    post = str(post).lower()
    
    post = post.translate(translator)
    modelKeys = model_to_make.keys()
    wordList = post.split(' ')
    
    for idx, word in enumerate(wordList):
        if word in modelKeys:
            wordList[idx] = model_to_make.get(word)
            
    unique = list(set(wordList))
    return unique if len(unique) else []
    

postsDF['words'] = postsDF['message'].map(lambda x: splitReplace(x))

In [583]:
def do_lift(x, y=None):
    l1, l2 = sorted(x), sorted(y or [])
    bools = pd.DataFrame()
    for e in l1 + l2:
        bools[e] = postsDF['words'].map(lambda words: e in words)

    def do(df, a, b):
        count_a_int_b = np.logical_and(df[a], df[b]).values.sum()
        count_a_mult_count_b = df[a].values.sum() * df[b].values.sum()
        return len(df.index) * count_a_int_b / count_a_mult_count_b

    result_df = pd.DataFrame(index=l1, columns=(l2 or l1))
    for a, b in (combinations(l1, 2) if y==None else product(l1, l2)):
        result_df.at[a, b] = do(bools, a, b)

    return result_df

In [584]:
do_lift(top10)

Unnamed: 0,acura,audi,bmw,chevrolet,honda,infiniti,nissan,subaru,toyota,volkswagen
acura,,2.25881,1.35088,1.34404,3.25366,3.17498,1.99418,1.6468,1.72194,1.72271
audi,,,1.66233,1.58915,1.71347,2.43379,1.23376,1.53754,1.57082,3.34349
bmw,,,,1.38111,1.18766,1.80204,1.25842,1.19879,1.4399,1.28244
chevrolet,,,,,1.84124,1.51297,2.77944,2.14877,2.54244,2.41639
honda,,,,,,1.50261,3.15628,1.8132,3.4366,2.37887
infiniti,,,,,,,3.46296,1.6095,1.81368,1.72915
nissan,,,,,,,,2.3052,4.11521,2.52408
subaru,,,,,,,,,1.65967,2.8731
toyota,,,,,,,,,,2.08856
volkswagen,,,,,,,,,,


### Task B.
##### Insights

In [451]:
# Task C.

modelWords = list(modelsDF['make'].unique()) + list(modelsDF['model'].unique())

countTuple = [(k, v) for k, v in counts.items()]
filtered = list(filter(lambda x: x[0] not in modelWords, countTuple))

filtered.sort(key=lambda x: x[1], reverse=True)
display('Example attributes sorted:', filtered[0:20])

attributes = ['performance', 'luxury', 'power', 'handling', 'speed']
print('Selected attributes are:', attributes)

'Example attributes sorted:'

[('', 3119),
 ('like', 1775),
 ('tl', 1678),
 ('one', 1472),
 ('would', 1417),
 ('dont', 1360),
 ('get', 1278),
 ('g35', 1223),
 ('think', 1195),
 ('better', 956),
 ('drive', 956),
 ('new', 921),
 ('much', 903),
 ('im', 899),
 ('performance', 885),
 ('3', 849),
 ('even', 840),
 ('people', 821),
 ('good', 797),
 ('really', 760)]

Selected attributes are: ['performance', 'luxury', 'power', 'handling', 'speed']


### Continue Tasks

In [506]:
do_lift(top10[0:5], importantWords)

Unnamed: 0,handling,luxury,performance,power,speed
acura,1.6829,2.03727,1.62576,1.85756,1.83004
audi,1.85199,1.89727,1.73898,1.93522,1.84652
bmw,1.79176,1.46202,1.62448,1.62643,1.3359
honda,1.67457,1.97196,1.51231,1.80006,1.95387
toyota,2.46224,2.7467,1.90022,1.58297,1.52278


In [515]:
do_lift(top10[0:5], ['love', 'aspire', 'want', 'dream', 'wish'])

Unnamed: 0,aspire,dream,love,want,wish
acura,1.78085,1.31374,1.87146,1.51717,1.34805
audi,3.79012,2.11818,2.05819,1.86406,1.92183
bmw,2.46141,2.07689,1.59364,1.61774,1.57191
honda,2.34205,1.15183,1.89126,1.57382,1.5396
toyota,3.67498,1.50614,1.26023,2.0677,1.68376


In [473]:
postsDF.head()

Unnamed: 0,date,userid,message,words
0,2002-03-25T05:54:02+00:00,merc1,I personally think that with a few tweaks the ...,"[acura, ford, problem, power, toyota, car, bui..."
1,2002-03-25T07:06:29+00:00,fredvh,I am debating a new purchase and these two are...,"[acura, honda]"
2,2002-03-25T17:02:27+00:00,blueguydotcom,"Great handling, RWD, excellent engine and the ...",[handling]
3,2002-03-25T23:04:37+00:00,hungrywhale,And no manual tranny. That may not matter to y...,[]
4,2002-03-26T00:44:13+00:00,riez,One beauty of BMW 3 Series is that there are s...,"[performance, sedan, bmw]"
