# Data Preparation of Mined Product Reviews from Lululemon.com

In [53]:
import pandas as pd
import numpy as np
import string
import datetime
import re

import matplotlib.pyplot as plt

pd.options.display.max_colwidth = 80

## 1. Load Data

In [54]:
cols = [
    'product_name',
    'product_category',
    'product_category_type',
    'product_price',
    'product_average_rating',
    'customer_username',
    'review_rating',
    'customer_location',
    'customer_athlete_type',
    'customer_age_range',
    'customer_body_type',
    'review_likes',
    'review_dislikes',
    'review_fit',
    'review_title',
    'review_date',
    'review_text',
    'response_date',
    'response_text',
    'review_helpful_count',
    'review_nothelpful_count'
]
df = pd.read_csv('mensrunning_lululemon.csv', names=cols)
pd.set_option('display.max_columns', df.shape[1])

In [55]:
print('Observations: {}'.format(df.shape[0]))
print('Features: {}'.format(df.shape[1]))
df.head()

Observations: 2418
Features: 21


Unnamed: 0,product_name,product_category,product_category_type,product_price,product_average_rating,customer_username,review_rating,customer_location,customer_athlete_type,customer_age_range,customer_body_type,review_likes,review_dislikes,review_fit,review_title,review_date,review_text,response_date,response_text,review_helpful_count,review_nothelpful_count
0,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,$68.00,2.9 out of 5,NYR26,5 out of 5,"STAMFORD, CT",SWEATY GENERALIST,18-24,ATHLETIC,,,,The best tshirt,2018-10-12,These are the most comfortable T-shirts. Wish you guys would make more color...,,,0,0
1,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,$68.00,2.9 out of 5,MJB23,1 out of 5,"CHICAGO, IL, USA",SWEATY GENERALIST,18-24,MUSCULAR,design,quality,,Very Poor Quality,2018-10-11,I purchased this Short Sleeve expecting it to be just like all the other one...,"October 11, 2018","Dear mjb23,\n\nThanks for reaching out and providing this feedback for us. W...",0,0
2,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,$68.00,2.9 out of 5,JKD123,1 out of 5,"CHICAGO, IL, USA",RUNNER,25-34,ATHLETIC,,"stretch, comfort",,Stretches When You Sweat,2018-10-08,I bought this shirt for my husband to run the Chicago Marathon. Upon startin...,"October 9, 2018","Hi Jkd123,\n\nThanks for reaching out and providing this feedback for us. I ...",0,0
3,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,$68.00,2.9 out of 5,SLASH,5 out of 5,"OVERLAND PARK, KS, USA",SWEATY GENERALIST,45-54,MUSCULAR,,,,Love Lulu,2018-10-08,I have several shirts and shorts I have purchased from Lululemon. Hands down...,,,0,0
4,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,$68.00,2.9 out of 5,BRANDONM19,4 out of 5,"HUTCHINSON, KS",SWEATY GENERALIST,18-24,ATHLETIC,sweat wicking material,,,Sweat wicking shirt,2018-10-03,Great for workouts.,,,0,0


## 2. Data Pre-Processing

### Pre-Process Data

In [56]:
# Convert to datetime objects # 
df['review_date'] = pd.to_datetime(df['review_date'], infer_datetime_format=True)
df['response_date'] = pd.to_datetime(df['response_date'], infer_datetime_format=True)

# Strip ratings
df['review_rating'] = df['review_rating'].apply(lambda x: re.split(' ', x)[0]).map(float)
df['product_average_rating'] = df['product_average_rating'].apply(lambda x: re.split(' ', x)[0]).map(float)

# Strip dollar signs
df['product_price'] = df['product_price'].str.slice(1).astype(float)

# Clean product names
df['product_name'] = df['product_name'].str.replace('\n', ' ')

# Clean review and response text
df['review_text'] = df['review_text'].str.replace('\n', ' ', )
df['response_text'] = df['response_text'].str.replace('\n', ' ')

### Fixing product_name and product_type mislabellings

In [57]:
# Find products with different product types 

grp_prod = df.groupby(['product_name', 'product_category_type'])['product_category_type'].describe().reset_index()
mismatched = grp_prod[grp_prod.duplicated(subset='product_name', keep=False)][['product_name', 'product_category_type']]
mismatched = mismatched['product_name'].unique()

In [58]:
# Replace mismatched productType with 'Hoodies'

X = df[ (df['product_name']==mismatched[0]) | (df['product_name']==mismatched[1]) ].index
df.loc[X, 'product_category_type'] = 'Hoodies'

### Removing Duplicate Entries

In [59]:
# Some products have different urls with the same reviews to them if there are styles of the product on sale
# There are also duplicates when a product is updated and reviews stay the same

df = df[-df.duplicated(subset=['review_text','customer_username'])].reset_index(drop=True)

### Missing Values

In [60]:
df[df.columns[df.isnull().any()]].isnull().sum()

customer_location         344
customer_athlete_type     346
customer_age_range        311
customer_body_type        312
review_likes              773
review_dislikes           939
review_fit               1548
review_title                8
response_date            1190
response_text            1190
dtype: int64

In [61]:
# Replace missing values where a customer/representative wouldn't have responded to a feature and fill with 'No Response'
df.fillna('No Response', inplace=True)
print('Missing Values: {}'.format(any(df.isnull().any())))

Missing Values: False


# 3. Feature Engineering

In [62]:
# Create columns for length of each review
df['review_length'] = df['review_text'].apply(lambda x: len(x))

In [63]:
# Create binary column for response (1) or not (0)
df['review_response'] = np.where(df['response_text'] == 'No Response', 0, 1)

In [64]:
df.head()

Unnamed: 0,product_name,product_category,product_category_type,product_price,product_average_rating,customer_username,review_rating,customer_location,customer_athlete_type,customer_age_range,...,review_fit,review_title,review_date,review_text,response_date,response_text,review_helpful_count,review_nothelpful_count,review_length,review_response
0,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,NYR26,5.0,"STAMFORD, CT",SWEATY GENERALIST,18-24,...,No Response,The best tshirt,2018-10-12,These are the most comfortable T-shirts. Wish you guys would make more color...,No Response,No Response,0,0,150,0
1,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,MJB23,1.0,"CHICAGO, IL, USA",SWEATY GENERALIST,18-24,...,No Response,Very Poor Quality,2018-10-11,I purchased this Short Sleeve expecting it to be just like all the other one...,2018-10-11 00:00:00,"Dear mjb23, Thanks for reaching out and providing this feedback for us. We ...",0,0,567,1
2,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,JKD123,1.0,"CHICAGO, IL, USA",RUNNER,25-34,...,No Response,Stretches When You Sweat,2018-10-08,I bought this shirt for my husband to run the Chicago Marathon. Upon startin...,2018-10-09 00:00:00,"Hi Jkd123, Thanks for reaching out and providing this feedback for us. I co...",0,0,269,1
3,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,SLASH,5.0,"OVERLAND PARK, KS, USA",SWEATY GENERALIST,45-54,...,No Response,Love Lulu,2018-10-08,I have several shirts and shorts I have purchased from Lululemon. Hands down...,No Response,No Response,0,0,170,0
4,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,BRANDONM19,4.0,"HUTCHINSON, KS",SWEATY GENERALIST,18-24,...,No Response,Sweat wicking shirt,2018-10-03,Great for workouts.,No Response,No Response,0,0,19,0


In [65]:
df.shape

(1859, 23)

We now have left 1859 unique product reviews.

### Creating a customer country column
- locations are either:
    - city/town, state(abbv)
    - city, province/territory
    - state(abbv)
    - city/town
    - town
- we can narrow this down by separating locations that:
    - 1) have a comma
    - 2) are just one string
        - one string that with length two that is either a:
            - US state abbv, or
            - Canadian providence/territory
            - some other location not in the US or Canada
        - one string that is either a:
            - city/town name
            - a US state
            - a Canadian providence/territory
            - some other location not in the US or Canada

In [66]:
# Create an empty customer_country series
df['customer_country'] = pd.Series()

In [67]:
# Locations with a comma
locations = df[df['customer_location'] != 'No Response']['customer_location']
commas = locations[locations.str.contains(', ')]
print('Values: {}'.format(len(commas)))
commas.sample(20)

Values: 1247


1603          ATLANTA, GA, USA
1776         NEW YORK, NY, USA
883          LEHIGH VALLEY, PA
1381         NEW YORK, NY, USA
1503        KNOXVILLE, TN, USA
230         ST. LOUIS, MO, USA
1803            SUCCASUNNA, NJ
810            NAPLES, FL, USA
658            CALIFORNIA, USA
942              ANN ARBOR, MI
423       PANAMA CITY, FL, USA
1523      SANTA CLARA, CA, USA
1564    KENORA, ON P0V, CANADA
280           HOUSTON, TX, USA
1120    SAN FRANCISCO, CA, USA
1363         NEWPORT BEACH, CA
243           BURBANK, CA, USA
128         KNOXVILLE, TN, USA
565           CHICAGO, IL, USA
1624            MIAMI, FL, USA
Name: customer_location, dtype: object

In [68]:
temp_countries_df = pd.DataFrame(columns = ['location', 'country'], index=commas.index)
temp_countries_df['location'] = [loc[-1] for loc in commas.str.split(', ')]
temp_countries_df.head()

Unnamed: 0,location,country
0,CT,
1,USA,
2,USA,
3,USA,
4,KS,


In [69]:
# US States
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

# Provinces/Territories in Canada
provs_terrs = {
    'AB': 'Alberta',
    'BC': 'British Columbia',
    'MB': 'Manitoba',
    'NB': 'New Brunswick',
    'NL': 'Newfoundland and Labrador',
    'NT': 'Northwest Territories',
    'NS': 'Nova Scotia',
    'NU': 'Nunavut',
    'ON': 'Ontario',
    'PE': 'Prince Edward Island',
    'QC': 'Quebec',
    'SK': 'Saskatchewan',
    'YT': 'Yukon'
}

In [70]:
usa = ['United States', 'United States of America', 'US', 'USA', 'UNITED STATES']
usa_condition = [i in usa for i in temp_countries_df['location']] # locations that are in the USA

usa_states_abbvs = states.keys()
usa_state_abbvs_condition = [i in usa_states_abbvs for i in temp_countries_df['location']] # locations that are state abbreviations in the USA

usa_states = [state.upper() for state in states.values()]
usa_state_condition = [i in usa_states for i in temp_countries_df['location']] # locations that are states in USA

canada = ['Canada', 'CANADA']
canada_condition = [i in canada for i in temp_countries_df['location']] # locations that are in Canada

canada_provs_terrs_abbvs = provs_terrs.keys()
canada_prov_terr_abbvs_condition = [i in canada_provs_terrs_abbvs for i in temp_countries_df['location']] # locations that are province/territory abbreviations in Canada

canada_provs_terrs = [prov_terr.upper() for prov_terr in provs_terrs.values()]
canada_prov_terr_condition = [i in canada_provs_terrs for i in temp_countries_df['location']] # locations that are provinces/territories in Canada


australia = ['AUS', 'Australia', 'AUSTRALIA']
australia_condition = [i in australia for i in temp_countries_df['location']] # locations that are in Australia

mexico = ['MEXICO', 'MEX']
mexico_condition = [i in mexico for i in temp_countries_df['location']] # locations that are in Mexico

uk = ['United Kingdom', 'UNITED KINGDOM', 'UK']
uk_condition = [i in uk for i in temp_countries_df['location']] # locations that are in the UK

philippines = ['Philippines', 'PHILIPPINES', 'PHL', 'PH']
philippines_condition = [i in philippines for i in temp_countries_df['location']] # locations that are in the Phillipines

In [71]:
temp_countries_df.loc[usa_condition, 'country'] = ['USA' for i in temp_countries_df['location'] if i in usa]
temp_countries_df.loc[usa_state_abbvs_condition, 'country'] = ['USA' for i in temp_countries_df['location'] if i in usa_states_abbvs]
temp_countries_df.loc[usa_state_condition, 'country'] = ['USA' for i in temp_countries_df['location'] if i in usa_states]

temp_countries_df.loc[canada_condition, 'country'] = ['Canada' for i in temp_countries_df['location'] if i in canada]
temp_countries_df.loc[canada_prov_terr_abbvs_condition, 'country'] = ['Canada' for i in temp_countries_df['location'] if i in canada_provs_terrs_abbvs]
temp_countries_df.loc[canada_prov_terr_condition, 'country'] = ['Canada' for i in temp_countries_df['location'] if i in canada_provs_terrs]

temp_countries_df.loc[australia_condition, 'country'] = ['Australia' for i in temp_countries_df['location'] if i in australia]

temp_countries_df.loc[mexico_condition, 'country'] = ['Mexico' for i in temp_countries_df['location'] if i in mexico]

temp_countries_df.loc[uk_condition, 'country'] = ['UK' for i in temp_countries_df['location'] if i in uk]

temp_countries_df.loc[philippines_condition, 'country'] = ['Philippines' for i in temp_countries_df['location'] if i in philippines]

In [72]:
temp_countries_df.loc[temp_countries_df['country'].isnull(), 'location'].unique()

array(['BC / LOS ANGELES', 'ONTATIO', 'EARTH',
       'BUT TRAVELING IN CENTRAL AND SOUTH AMERICA', 'CA USA'],
      dtype=object)

In [73]:
# Manually enter what is leftover ... can't win them all
temp_countries_df.loc[temp_countries_df['country'].isnull(), 'location']

359                               BC / LOS ANGELES
436                                        ONTATIO
734                                          EARTH
951     BUT TRAVELING IN CENTRAL AND SOUTH AMERICA
1323                                        CA USA
Name: location, dtype: object

In [74]:
temp_countries_df.loc[359]['country'] = 'Canada'

In [75]:
df.loc[436, 'customer_location'] = 'LONDON, ONTARIO'

In [76]:
temp_countries_df.loc[436]['country'] = 'Canada'

In [77]:
temp_countries_df.loc[734, 'country'] = 'USA'

In [78]:
print('Other products bought: {}'.format(df[df['customer_username'] == df.loc[951, 'customer_username']].shape[0] > 1))

Other products bought: False


In [79]:
temp_countries_df.loc[951, 'country'] = 'Canada'

In [80]:
temp_countries_df.loc[1323, 'country'] = 'USA'

In [81]:
temp_countries_df.isnull().sum()

location    0
country     0
dtype: int64

In [82]:
df['customer_country'] = temp_countries_df['country']

In [83]:
# Locations without a comma
no_commas = locations[-locations.str.contains(', ')]
print('Values: {}'.format(len(no_commas)))
no_commas.sample(20)

Values: 268


1339           ARIZONA
1519          GRAND RA
697            RALEIGH
391                NYC
749          SINGAPORE
1300        SEATTLE WA
1713          NEW YORK
1529                NH
1487         SINGAPORE
1678     MASSACHUSETTS
1854           CALGARY
279                NYC
79              DALLAS
915            CHICAGO
1473          SCOTLAND
345           VIRGINIA
945           VICTORIA
332            GEORGIA
1841         LAS VEGAS
317     SAINT LOUIS MO
Name: customer_location, dtype: object

In [84]:
# Make a new temp_countries_df to handle these locations
temp_countries_df = pd.DataFrame(columns = [ 'location',
                                            'split_location',
                                            'country'], 
                                 index=no_commas.index)
temp_countries_df['location'] = no_commas
temp_countries_df['split_location'] = [loc[-1] for loc in no_commas.str.split(' ')]
temp_countries_df.head()

Unnamed: 0,location,split_location,country
12,SAN FRANCISCO,FRANCISCO,
20,WISCONSIN,WISCONSIN,
27,DENVER COLORADO,COLORADO,
30,NJ,NJ,
40,PHILADELPHIA,PHILADELPHIA,


In [85]:
def check_location_condition(country_list):
    return [i in country_list for i in temp_countries_df['split_location']]

usa_condition = check_location_condition(usa)
usa_state_abbvs_condition = check_location_condition(usa_states_abbvs)
usa_state_condition = check_location_condition(usa_states)

canada_condition = check_location_condition(canada)
canada_prov_terr_condition = check_location_condition(canada_provs_terrs)
canada_prov_terr_abbvs_condition = check_location_condition(canada_provs_terrs_abbvs)

australia_condition = check_location_condition(australia)

mexico_condition = check_location_condition(mexico)

uk_condition = check_location_condition(uk)

philippines_condition = check_location_condition(philippines)

In [86]:
def create_country(condition, country, country_list):
    temp_countries_df.loc[condition, 'country'] = [country for i in temp_countries_df['split_location'] if i in country_list]
    return temp_countries_df

In [87]:
temp_countries_df = create_country(usa_condition, 'USA', usa)
temp_countries_df = create_country(usa_state_abbvs_condition, 'USA', usa_states_abbvs)
temp_countries_df = create_country(usa_state_condition, 'USA', usa_states)
temp_countries_df = create_country(canada_condition, 'Canada', canada)
temp_countries_df = create_country(canada_prov_terr_condition, 'Canada', canada_provs_terrs)
temp_countries_df = create_country(canada_prov_terr_abbvs_condition, 'Canada', canada_provs_terrs_abbvs)
temp_countries_df = create_country(australia_condition, 'Australia', australia)
temp_countries_df = create_country(mexico_condition, 'Mexico', mexico)
temp_countries_df = create_country(uk_condition, 'UK', uk)
temp_countries_df = create_country(philippines_condition, 'Philippines', philippines)

In [88]:
# Examine original locations
leftovers = temp_countries_df[temp_countries_df['country'].isnull()]['location']
leftovers.unique()

array(['SAN FRANCISCO', 'PHILADELPHIA', 'MIAMI', 'DALLAS', 'CHICAGO',
       'DENVER', 'NEW YORK', 'BOSTON', 'SAN DIEGO', 'OTTAWA', 'HOUSTON',
       'DETROIT', 'SCOTTSDALE', 'ST. LOUIS', 'SPOKANE', 'NYC', 'PHOENIX',
       'UNITED STATES', 'ONTAIRO', 'LOS ANGELES', 'KANSAS CITY',
       'MANHATTAN', 'ONLINE', 'SEATTLE', 'NEW JERSEY', 'TORONTO',
       'WASHINGTON D.C.', 'SOUTH EAST ASIA', 'CLEVELAND', 'NOCAL',
       'SINGAPORE', 'NORTH CAROLINA', 'TAIWAN', 'MEXICO CITY', 'RALEIGH',
       'LONG BEACH', 'MINNEAPOLIS', 'AUSTIN', 'PACIFIC NW',
       'RIVERHEAD NEW YORK', 'AUSTIB', 'VICTORIA', 'VANCOUVER',
       'SACRAMENTO', 'BAY AREA', 'SOUTH KOREA', 'NANAIMO', 'DFW',
       'BOSTON,MA', 'WASHINGTON STATE', 'OC', 'CALGARY',
       'LAC DU BONNET,MB', 'ALL OVER', 'NORTHEAST', 'HONG KONG',
       'WINNIPEG', 'CINCINNATI,OHIO', 'NEW BRUNSWICK', 'LONG ISLAND',
       'BROOKLYN', 'ATLANTA', 'ALL OVER THE COUNTRY', 'SCOTLAND',
       'NEWFOUNDLAND', 'GRAND RA', 'SALT LAKE CITY', 'FORT MCMU

In [89]:
def manual_condition(location):
    return [loc == location for loc in temp_countries_df['location']]
def create_manual_country(condition, country):
    temp_countries_df.loc[condition, 'country'] = country
    return temp_countries_df

In [90]:
not_us = [
    'OTTAWA',
    'ONTAIRO',
    'TORONTO',
    'SOUTH EAST ASIA',
    'SINGAPORE',
    'TAIWAN',
    'MEXICO CITY',
    'VICTORIA',
    'VANCOUVER',
    'SOUTH KOREA',
    'NANAIMO',
    'CALGARY',
    'LAC DU BONNET,MB',
    'HONG KONG',
    'WINNIPEG',
    'SCOTLAND',
    'NEWFOUNDLAND',
    'FORT MCMURRAY',
    'HALIFAX']

In [91]:
us_places = [place for place in leftovers.unique() if place not in not_us]

In [92]:
for i in us_places:
    create_manual_country(manual_condition(i), 'USA')

In [93]:
canada_places = [
    'OTTAWA',
    'ONTAIRO',
    'TORONTO',
    'VICTORIA',
    'VANCOUVER',
    'NANAIMO',
    'CALGARY',
    'LAC DU BONNET,MB',
    'WINNIPEG',
    'NEWFOUNDLAND',
    'FORT MCMURRAY'
]

In [94]:
for i in canada_places:
    create_manual_country(manual_condition(i), 'Canada')

In [95]:
temp_countries_df[temp_countries_df['country'].isnull()]

Unnamed: 0,location,split_location,country
487,SOUTH EAST ASIA,ASIA,
559,SINGAPORE,SINGAPORE,
650,TAIWAN,TAIWAN,
661,MEXICO CITY,CITY,
749,SINGAPORE,SINGAPORE,
1000,SOUTH KOREA,KOREA,
1278,HONG KONG,KONG,
1473,SCOTLAND,SCOTLAND,
1487,SINGAPORE,SINGAPORE,
1626,SOUTH EAST ASIA,ASIA,


In [96]:
create_manual_country(manual_condition('SOUTH EAST ASIA'), 'South East Asia')
create_manual_country(manual_condition('SINGAPORE'), 'Singapore')
create_manual_country(manual_condition('TAIWAN'), 'Taiwan')
create_manual_country(manual_condition('MEXICO CITY'), 'Mexico')
create_manual_country(manual_condition('SOUTH KOREA'), 'South Korea')
create_manual_country(manual_condition('HONG KONG'), 'Hong Kong')
create_manual_country(manual_condition('SCOTLAND'), 'Scotland')
create_manual_country(manual_condition('HALIFAX'), 'UK')

Unnamed: 0,location,split_location,country
12,SAN FRANCISCO,FRANCISCO,USA
20,WISCONSIN,WISCONSIN,USA
27,DENVER COLORADO,COLORADO,USA
30,NJ,NJ,USA
40,PHILADELPHIA,PHILADELPHIA,USA
51,MIAMI,MIAMI,USA
53,US,US,USA
56,VERMONT,VERMONT,USA
68,CALIFORNIA,CALIFORNIA,USA
78,PA,PA,USA


In [97]:
df.loc[temp_countries_df.index, 'customer_country'] = temp_countries_df['country']

In [99]:
df['customer_country'].fillna('No Response', inplace=True)

In [103]:
any(df.isnull().any())

False

In [104]:
df.to_csv('cleaned_mensrunning_lululemon.csv')

In [105]:
df.head()

Unnamed: 0,product_name,product_category,product_category_type,product_price,product_average_rating,customer_username,review_rating,customer_location,customer_athlete_type,customer_age_range,...,review_title,review_date,review_text,response_date,response_text,review_helpful_count,review_nothelpful_count,review_length,review_response,customer_country
0,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,NYR26,5.0,"STAMFORD, CT",SWEATY GENERALIST,18-24,...,The best tshirt,2018-10-12,These are the most comfortable T-shirts. Wish you guys would make more color...,No Response,No Response,0,0,150,0,USA
1,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,MJB23,1.0,"CHICAGO, IL, USA",SWEATY GENERALIST,18-24,...,Very Poor Quality,2018-10-11,I purchased this Short Sleeve expecting it to be just like all the other one...,2018-10-11 00:00:00,"Dear mjb23, Thanks for reaching out and providing this feedback for us. We ...",0,0,567,1,USA
2,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,JKD123,1.0,"CHICAGO, IL, USA",RUNNER,25-34,...,Stretches When You Sweat,2018-10-08,I bought this shirt for my husband to run the Chicago Marathon. Upon startin...,2018-10-09 00:00:00,"Hi Jkd123, Thanks for reaching out and providing this feedback for us. I co...",0,0,269,1,USA
3,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,SLASH,5.0,"OVERLAND PARK, KS, USA",SWEATY GENERALIST,45-54,...,Love Lulu,2018-10-08,I have several shirts and shorts I have purchased from Lululemon. Hands down...,No Response,No Response,0,0,170,0,USA
4,Metal Vent Tech Short Sleeve,Tops,Short Sleeves,68.0,2.9,BRANDONM19,4.0,"HUTCHINSON, KS",SWEATY GENERALIST,18-24,...,Sweat wicking shirt,2018-10-03,Great for workouts.,No Response,No Response,0,0,19,0,USA


In [None]:
|