### Word Frequency

In [1]:
import string
import pandas as pd
import numpy as np
import nltk
import re
from itertools import combinations, product
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
translator = str.maketrans('', '', string.punctuation)

[nltk_data] Downloading package stopwords to /Users/matt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1 & 2

We developed a scraper called "scrappy" that lives in the scrappy folder.

In [2]:
df = pd.read_csv('./scrappy/scrappy_do.csv')
df = df[0:5000] # first 5000
df.head()

Unnamed: 0,date,userid,message
0,2002-03-25T05:54:02+00:00,merc1,I personally think that with a few tweaks the ...
1,2002-03-25T07:06:29+00:00,fredvh,I am debating a new purchase and these two are...
2,2002-03-25T17:02:27+00:00,blueguydotcom,"Great handling, RWD, excellent engine and the ..."
3,2002-03-25T23:04:37+00:00,hungrywhale,And no manual tranny. That may not matter to y...
4,2002-03-26T00:44:13+00:00,riez,One beauty of BMW 3 Series is that there are s...


### 3

In [3]:
modelsDF = pd.read_csv('./models.csv', names=['make', 'model'])
# this dataframe contains some other mappings like seat => seats
# for the makes / models will we ignore those, they are at the end
makemodels = modelsDF[0:520]

def wordCount(post):
    d = {}
    l = str(post).lower().translate(translator).split(" ")
    filtered = [w for w in l if w not in stop_words]
    for w in filtered:
        d[w] = 1 if not d.get(w) else d[w] + 1
    return d

allText = ' '.join(list(map(lambda x: str(x), list(df['message']))))
counts = wordCount(allText)

def extractMakes(wordCounts):
    makes = {}
    for make in makemodels['make']:
        if wordCounts.get(make):
            makes[make] = wordCounts[make]
    return makes

def combineModels(makes, wordCounts):
    for row in makemodels.iterrows():
        make = row[1].make
        model = row[1].model
        if wordCounts.get(model):
            makes[make] += wordCounts.get(model)
    return makes
    
carCounts = combineModels(extractMakes(counts), counts)
carCounts = [(k, v) for k, v in carCounts.items()]
carCounts.sort(key=lambda x: x[1], reverse=True)
display(carCounts)

top10 = list(map(lambda c: c[0], carCounts[0:10]))
print('Top 10: {}'.format(top10))

[('bmw', 2430),
 ('acura', 793),
 ('audi', 780),
 ('honda', 637),
 ('toyota', 522),
 ('infiniti', 515),
 ('nissan', 468),
 ('volkswagen', 416),
 ('subaru', 351),
 ('chevrolet', 209),
 ('mercedes', 206),
 ('volvo', 195),
 ('ford', 190),
 ('hyundai', 187),
 ('mazda', 91),
 ('cadillac', 89),
 ('chrysler', 81),
 ('dodge', 65),
 ('pontiac', 65),
 ('lincoln', 58),
 ('buick', 30),
 ('kia', 25),
 ('mitsubishi', 24),
 ('suzuki', 18),
 ('mercury', 16),
 ('saturn', 12)]

Top 10: ['bmw', 'acura', 'audi', 'honda', 'toyota', 'infiniti', 'nissan', 'volkswagen', 'subaru', 'chevrolet']


In [4]:
# Task C.

modelWords = list(modelsDF['make'].unique()) + list(modelsDF['model'].unique())
model_to_make = modelsDF.set_index('model').to_dict(orient='dict')['make']

countTuple = [(k, v) for k, v in counts.items()]
filtered = list(filter(lambda x: x[0] not in modelWords, countTuple))

filtered.sort(key=lambda x: x[1], reverse=True)
display('Example attributes sorted:', filtered[0:20])

attributes = ['performance', 'luxury', 'power', 'handling', 'speed']
print('Selected attributes are:', attributes)

'Example attributes sorted:'

[('', 3119),
 ('like', 1775),
 ('tl', 1678),
 ('one', 1472),
 ('would', 1417),
 ('dont', 1360),
 ('get', 1278),
 ('g35', 1223),
 ('think', 1195),
 ('better', 956),
 ('drive', 956),
 ('new', 921),
 ('much', 903),
 ('im', 899),
 ('performance', 885),
 ('3', 849),
 ('even', 840),
 ('people', 821),
 ('good', 797),
 ('really', 760)]

Selected attributes are: ['performance', 'luxury', 'power', 'handling', 'speed']


### Task A.

In [5]:
def splitReplace(post):
    post = str(post).lower()
    
    post = post.translate(translator)
    modelKeys = model_to_make.keys()
    wordList = post.split(' ')
    
    for idx, word in enumerate(wordList):
        if word in modelKeys:
            wordList[idx] = model_to_make.get(word)
            
    unique = list(set(wordList))
    return unique if len(unique) else []
    

df['words'] = df['message'].map(lambda x: splitReplace(x))

In [6]:
def do_lift(x, y=None):
    l1, l2 = sorted(x), sorted(y or [])
    bools = pd.DataFrame()
    for e in l1 + l2:
        bools[e] = df['words'].map(lambda words: e in words)

    def do(df, a, b):
        count_a_int_b = np.logical_and(df[a], df[b]).values.sum()
        count_a_mult_count_b = df[a].values.sum() * df[b].values.sum()
        return len(df.index) * count_a_int_b / count_a_mult_count_b

    result_df = pd.DataFrame(index=l1, columns=(l2 or l1))
    for a, b in (combinations(l1, 2) if y==None else product(l1, l2)):
        val = do(bools, a, b)
        result_df.at[a, b] = val
        if y==None:
            result_df.at[b, a] = val

    return result_df

In [7]:
do_lift(top10)

Unnamed: 0,acura,audi,bmw,chevrolet,honda,infiniti,nissan,subaru,toyota,volkswagen
acura,,2.03696,1.4918,1.40861,2.59956,3.06937,2.21297,2.18125,1.85185,1.62348
audi,2.03696,,1.63962,1.3651,1.48556,2.48269,1.52772,1.81727,1.35021,4.57699
bmw,1.4918,1.63962,,1.54958,1.26302,2.05532,1.30274,1.25957,1.43922,1.56816
chevrolet,1.40861,1.3651,1.54958,,1.31838,2.0312,1.69033,2.18404,1.63399,1.66168
honda,2.59956,1.48556,1.26302,1.31838,,1.41836,3.96257,1.51299,3.95273,2.27923
infiniti,3.06937,2.48269,2.05532,2.0312,1.41836,,2.76243,1.7778,1.19705,1.7948
nissan,2.21297,1.52772,1.30274,1.69033,3.96257,2.76243,,1.45101,3.85057,2.63004
subaru,2.18125,1.81727,1.25957,2.18404,1.51299,1.7778,1.45101,,1.40264,2.09767
toyota,1.85185,1.35021,1.43922,1.63399,3.95273,1.19705,3.85057,1.40264,,2.07156
volkswagen,1.62348,4.57699,1.56816,1.66168,2.27923,1.7948,2.63004,2.09767,2.07156,


### Task B.
##### Insights

In [8]:
# Task C.

modelWords = list(modelsDF['make'].unique()) + list(modelsDF['model'].unique())

countTuple = [(k, v) for k, v in counts.items()]
filtered = list(filter(lambda x: x[0] not in modelWords, countTuple))

filtered.sort(key=lambda x: x[1], reverse=True)
display('Example attributes sorted:', filtered[0:20])

attributes = ['performance', 'luxury', 'power', 'handling', 'speed']
print('Selected attributes are:', attributes)

'Example attributes sorted:'

[('', 3119),
 ('like', 1775),
 ('tl', 1678),
 ('one', 1472),
 ('would', 1417),
 ('dont', 1360),
 ('get', 1278),
 ('g35', 1223),
 ('think', 1195),
 ('better', 956),
 ('drive', 956),
 ('new', 921),
 ('much', 903),
 ('im', 899),
 ('performance', 885),
 ('3', 849),
 ('even', 840),
 ('people', 821),
 ('good', 797),
 ('really', 760)]

Selected attributes are: ['performance', 'luxury', 'power', 'handling', 'speed']


### Continue Tasks

In [9]:
top_10_brands = do_lift(top10)

In [10]:
topBrandsList = list(top_10_brands.columns)

In [11]:
topBrandsList

['acura',
 'audi',
 'bmw',
 'chevrolet',
 'honda',
 'infiniti',
 'nissan',
 'subaru',
 'toyota',
 'volkswagen']

In [12]:
top_10_brands

Unnamed: 0,acura,audi,bmw,chevrolet,honda,infiniti,nissan,subaru,toyota,volkswagen
acura,,2.03696,1.4918,1.40861,2.59956,3.06937,2.21297,2.18125,1.85185,1.62348
audi,2.03696,,1.63962,1.3651,1.48556,2.48269,1.52772,1.81727,1.35021,4.57699
bmw,1.4918,1.63962,,1.54958,1.26302,2.05532,1.30274,1.25957,1.43922,1.56816
chevrolet,1.40861,1.3651,1.54958,,1.31838,2.0312,1.69033,2.18404,1.63399,1.66168
honda,2.59956,1.48556,1.26302,1.31838,,1.41836,3.96257,1.51299,3.95273,2.27923
infiniti,3.06937,2.48269,2.05532,2.0312,1.41836,,2.76243,1.7778,1.19705,1.7948
nissan,2.21297,1.52772,1.30274,1.69033,3.96257,2.76243,,1.45101,3.85057,2.63004
subaru,2.18125,1.81727,1.25957,2.18404,1.51299,1.7778,1.45101,,1.40264,2.09767
toyota,1.85185,1.35021,1.43922,1.63399,3.95273,1.19705,3.85057,1.40264,,2.07156
volkswagen,1.62348,4.57699,1.56816,1.66168,2.27923,1.7948,2.63004,2.09767,2.07156,


In [13]:
do_lift(top10[0:5], ['love', 'aspire', 'want', 'dream', 'wish'])

Unnamed: 0,aspire,dream,love,want,wish
acura,2.39464,1.79598,1.71691,1.36836,2.08229
audi,0.0,1.58228,1.55242,1.48375,1.37589
bmw,2.40941,1.68658,1.56384,1.5392,1.25708
honda,1.52812,0.0,1.90294,1.43296,1.3288
toyota,2.08333,2.08333,0.86478,1.73993,2.17391


In [14]:
dissimilarity_matrix = 1/top_10_brands
np.fill_diagonal(dissimilarity_matrix.values, 0)
dissimilarity_matrix

Unnamed: 0,acura,audi,bmw,chevrolet,honda,infiniti,nissan,subaru,toyota,volkswagen
acura,0.0,0.490929,0.670331,0.70992,0.384681,0.3258,0.451881,0.458452,0.54,0.61596
audi,0.490929,0.0,0.609899,0.732545,0.673146,0.402789,0.654571,0.550276,0.740625,0.218484
bmw,0.670331,0.609899,0.0,0.645337,0.791751,0.486543,0.767612,0.793921,0.694821,0.637692
chevrolet,0.70992,0.732545,0.645337,0.0,0.758509,0.49232,0.5916,0.457867,0.612,0.6018
honda,0.384681,0.673146,0.791751,0.758509,0.0,0.705038,0.252362,0.660944,0.25299,0.438745
infiniti,0.3258,0.402789,0.486543,0.49232,0.705038,0.0,0.362,0.562492,0.835385,0.557165
nissan,0.451881,0.654571,0.767612,0.5916,0.252362,0.362,0.0,0.689176,0.259701,0.380222
subaru,0.458452,0.550276,0.793921,0.457867,0.660944,0.562492,0.689176,0.0,0.712941,0.47672
toyota,0.54,0.740625,0.694821,0.612,0.25299,0.835385,0.259701,0.712941,0.0,0.482727
volkswagen,0.61596,0.218484,0.637692,0.6018,0.438745,0.557165,0.380222,0.47672,0.482727,0.0


In [15]:
# Plotting MDS plot
from sklearn import manifold
seed = np.random.RandomState(seed=3)
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, 
      random_state=seed, dissimilarity="precomputed", n_jobs=1)
results = mds.fit(dissimilarity_matrix)
coords = results.embedding_

In [16]:
import matplotlib.pyplot as plt
plt.subplots_adjust(bottom = 0.1)
plt.scatter(
    coords[:, 0], coords[:, 1], marker = 'o'
    )
for label, x, y in zip(topBrandsList, coords[:, 0], coords[:, 1]):
    
    plt.annotate(
        label,
        xy = (x, y), xytext = (-10, 10),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'red', alpha = 0.5),
        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

In [17]:
do_lift(top10[0:5], attributes)

Unnamed: 0,handling,luxury,performance,power,speed
acura,1.4328,1.94219,1.40505,1.81142,1.55425
audi,1.54283,1.66249,1.3976,1.8135,1.38524
bmw,1.70861,1.33239,1.48973,1.52435,1.42564
honda,1.45616,1.77459,1.48473,1.75142,1.56848
toyota,2.12373,2.49616,1.44585,1.05062,1.25786
