In [None]:
import os
from glob import glob

import numpy as np
import pandas as pd

# Load RAW data

In [None]:
def load_data(data_dir: str = "raw_data/", verbose: bool = False) -> pd.DataFrame:
    """Load every ``Reviews-*.parquet`` file in ``data_dir`` into one DataFrame.

    Files are read in sorted order so the row order is reproducible between
    runs. Exact duplicate rows are dropped (first occurrence kept) and the
    result gets a fresh 0..n-1 index.

    Args:
        data_dir: Directory holding the parquet files, with or without a
            trailing path separator.
        verbose: Print each file name and the number of files read.

    Returns:
        The concatenated, de-duplicated reviews.

    Raises:
        ValueError: If no file in ``data_dir`` matches the pattern.
    """
    # os.path.join copes with a missing trailing separator; plain string
    # concatenation would turn "raw_data" into "raw_dataReviews-*.parquet".
    pattern = os.path.join(data_dir, "Reviews-*.parquet")
    files = sorted(glob(pattern))
    if not files:
        raise ValueError(f"No files matching {pattern!r} were found")

    dfs = []
    for file in files:
        if verbose:
            print(f"Reading in:   {file}")
        dfs.append(pd.read_parquet(file))

    if verbose:
        print(f"Loaded {len(dfs)} files")

    concat_df = pd.concat(dfs).drop_duplicates(keep="first")
    return concat_df.reset_index(drop=True)
# Raw reviews, loaded with the defaults (reads from "raw_data/", no progress output).
df = load_data()

# Transform / Process Data

In [None]:
def remove_whitespace(text: str) -> str:
    """Removes the first character if it is a space.

    Only a single leading " " is dropped; any other text, including the
    empty string, is returned unchanged.
    """
    # startswith() is safe on "", whereas indexing text[0] raises IndexError.
    return text[1:] if text.startswith(" ") else text


def transform_job_titles(job_title: str) -> str:
    """Puts exactly one space between the title and a trailing I/V rank.

    A title is treated as carrying a rank when it ends in "I" or "V"; the
    rank is the trailing run of "I", "V" and space characters.
    "EngineerII" becomes "Engineer II", while "Engineer II" is returned
    as-is instead of gaining a second space. Titles that do not end in
    I/V, or that consist only of those characters (e.g. "IV"), are returned
    unchanged rather than falling through to an implicit ``None``.

    NOTE: any title ending in "I"/"V" is split this way, so an acronym such
    as "AI" still becomes "A I".
    """
    if not job_title.endswith(("I", "V")):
        return job_title

    stem = job_title.rstrip("VI ")
    if not stem:
        # Nothing but numerals/spaces: there is no title to separate from.
        return job_title

    rank = job_title[len(stem):].lstrip(" ")
    return stem + " " + rank
    
def transform_status(status: str):
    """Replaces a "KEY NOT FOUND: jobLine.*" placeholder with a readable label.

    Any status that is not one of the known placeholders is returned as-is.
    """
    readable_labels = {
        "KEY NOT FOUND: jobLine.seasonal-current": "Current Seasonal",
        "KEY NOT FOUND: jobLine.seasonal-former": "Former Seasonal",
        "KEY NOT FOUND: jobLine.self_employ-former": "Former Self Employed",
        "KEY NOT FOUND: jobLine.self_employ-current": "Current Self Employed",
        "KEY NOT FOUND: jobLine.reserve-current": "Current Reserve",
        "KEY NOT FOUND: jobLine.reserve-former": "Former Reserve",
        "KEY NOT FOUND: jobLine.per_diem-current": "Current Per Diem",
        "KEY NOT FOUND: jobLine.per_diem-former": "Former Per Diem",
    }
    return readable_labels.get(status, status)


def transform_helpful_column(text: str):
    """Turns the "helpful" caption of a review into a vote count.

    Non-breaking spaces are removed first. The "Be the first ..." caption
    maps to 0, "<n> people/person found this review helpful" maps to n, and
    any other caption yields None.
    """
    caption = text.replace("\xa0", "")
    if caption == "Be the first to find this review helpful":
        return 0
    # Plural is tested before singular, as in the two possible captions.
    for noun in ("people", "person"):
        if noun in caption:
            return int(caption.replace(noun + " found this review helpful", ""))
    return None

# Cast each column to its final dtype and clean the free-text fields
df = df.assign(
    rating=pd.to_numeric(df["rating"]).astype(np.int8),
    status=df["status"].apply(transform_status).astype("category"),
    date=pd.to_datetime(df["date"]),
    experience=df["experience"].astype("category"),
    company=df["company"].astype("category"),
    category=df["category"].astype("category"),
    helpful=df["helpful"].apply(transform_helpful_column).astype(np.int16),
    # Per title: non-breaking spaces -> spaces, drop one leading space,
    # then separate a trailing I/V rank from the title.
    job_title=df["job_title"].apply(
        lambda title: transform_job_titles(
            remove_whitespace(title.replace("\xa0", " "))
        )
    ),
)

### Job Rank Cleaning

In [None]:
# NOTE(review): not referenced by any code visible in this notebook — presumably a leftover; confirm before removing.
potential_ranks = ["i", ]
# Whole-word rank markers, compared case-insensitively with each
# space-separated word of the title.
_JUNIOR_TOKENS = frozenset({"i"})
_SENIOR_TOKENS = frozenset({"iv", "v", "sr", "sr.", "sre"})

# Substring rank markers (all lowercase), listed from lowest to highest
# precedence: when several groups match, the LAST matching group wins.
_RANK_KEYWORDS = (
    ("Prejunior", ("apprentice", "apprenticeship", "internship", "intern", "(internship)", "sdeintern", "volunteer")),
    ("Junior", ("l 1", "t 1", "tier 1", "level 1", "(sde1)", "entry level", "graduate", "jr.", "junior", "l1", "level1", "sd1", "sde2", "se1", "t1", "t1/t2", "tier1")),
    ("Intermediate", ("l 2", "t 2", "tier 2", "level 2", "(sdeii)", "ii", "ii)", "ii,", "intermediate", "l2", "level2", "sd2", "sde2", "t2", "tier2")),
    ("Senior", ("l 3", "l 4", "l 5", "l 6", "t 3", "t 4", "t 5", "t 6", "t 7", "tier 3", "tier 4", "tier 5", "tier 6", "tier 7", "level 3", "level 4", "level 5", "level 6", "level 7", "(e3)", "(ic3)", "(ic4)", "(l3)", "(l7)", "(sde3)", "(se2)", "e(senior", "head", "ic5", "ict4", "ict5", "ii/senior", "iii", "iii)", "iii,", "iii/technical", "l3", "l4", "l5", "l6", "l7", "level4", "level5", "level6", "level7", "lead", "lead,", "level3", "principal", "principal,", "principle", "pro/lead", "leads", "senior", "seniors", "seniot", "senor", "sre/devops", "srspec", "sr.financial")),
    ("Management", ("vice pres", "(csa)", "assoc", "assocaie", "associate", "associate,", "associate1", "associateenterprise", "associatena", "associatw", "captain", "chef", "chief", "executiove", "executive", "executive,", "executivre", 'cso/ciso"', "dceo", "director", "director,", "direrctor", "management", "manager", "manager%2c", "manager)", "manager,", "manager/director", "manager/engineer", "manager/senior", "meister/data", "officer", "president", "president;", "vp")),
)


def extract_job_title_rank(job_title: str):
    """Extracts the rank from the job title.

    Two passes are made:

    1. Whole words: the first word that is a junior ("I") or senior
       ("IV", "V", "Sr", "Sr.", "SRE") marker decides the rank. Only words
       equal to that marker are removed from the title; characters inside
       other words are left alone ("IT Specialist I" -> "IT Specialist").
    2. If no word matched, the lowercased title is searched for the
       substrings in ``_RANK_KEYWORDS``; the last group with a hit wins.

    Returns:
        ``[title_without_rank, rank]`` where rank is one of "Prejunior",
        "Junior", "Intermediate", "Senior", "Management" or "No Rank".
    """
    words = job_title.split(" ")

    # Pass 1: whole-word markers.
    for word in words:
        token = word.strip()
        lowered = token.lower()
        if lowered in _JUNIOR_TOKENS:
            rank = "Junior"
        elif lowered in _SENIOR_TOKENS:
            rank = "Senior"
        else:
            continue
        # Drop the marker as a word; a plain str.replace() would also delete
        # the same letters inside every other word of the title.
        kept_words = [w for w in words if w.strip() != token]
        return [" ".join(kept_words).strip(), rank]

    # Pass 2: substring markers; later (higher-precedence) groups overwrite
    # earlier ones, and within a group the first keyword found is recorded.
    rank = "No Rank"
    listed_rank = ""
    haystack = job_title.lower().strip()
    for label, keywords in _RANK_KEYWORDS:
        for keyword in keywords:
            if keyword in haystack:
                rank = label
                listed_rank = keyword
                break

    # NOTE(review): keywords are lowercase and this replace is case-sensitive,
    # so the marker is only stripped from titles that spell it in lowercase
    # ("Senior Engineer" keeps "Senior"). Kept as-is because a
    # case-insensitive removal would blank titles such as "Manager" —
    # confirm the intended behaviour.
    return [job_title.replace(listed_rank, "").strip(), rank]
    
# Split job_title into title and rank column
# Each call returns a [title, rank] pair; build the two-column frame in one go.
df[["job_title", "job_rank"]] = pd.DataFrame(
    df["job_title"].apply(extract_job_title_rank).tolist(),
    index=df.index,
)

### Location

In [None]:
import geonamescache

# Reference tables built from geonamescache; each frame has one row per key of
# the dict returned by the corresponding getter.
# NOTE: the name `gc` would shadow the stdlib `gc` module if that were imported.
gc = geonamescache.GeonamesCache()
cities = pd.DataFrame.from_dict(gc.get_cities(), orient="index")  # columns used below: name, countrycode, population
countries = pd.DataFrame.from_dict(gc.get_countries(), orient="index")  # column used below: name
us_states = pd.DataFrame.from_dict(gc.get_us_states(), orient="index")  # column used below: code

# location_state column
# Strip only the leading "in " prefix. An unanchored replace removes every
# "in " in the string, e.g. "in Mountain View" would become "MountaView".
df['location'] = df['location'].str.replace(r'^\s*in ', '', regex=True)
# Candidate state: the text after the first ", ", upper-cased (rows without a ", " stay missing).
df['location_us_state'] = df['location'].str.split(', ').str[1].dropna().str.upper()
# Keep the candidate only when it is a known US state code; the codes are
# collected once instead of rebuilding the list for every row.
us_state_codes = set(us_states['code'])
df['location_us_state'] = df['location_us_state'].apply(lambda x: x if x in us_state_codes else None)
# Drop a trailing ", XX" (two non-digit characters) from the location text.
# Raw string: '\s' and '\D' are invalid escapes in a normal string literal.
df['location'] = df['location'].str.replace(r',\s\D\D$', '', regex=True)

# location_country column
# Sort ascending by population so that, for a city name occurring more than once,
# the most populous entry comes last — presumably the one that survives when the
# name-indexed Series is turned into a dict below (TODO confirm).
cities = cities.sort_values(by='population') 
country_abbrev_dict = countries['name'].to_dict()  # index of `countries` (presumably the country code) -> country name
cities['countrycode'] = cities['countrycode'].replace(country_abbrev_dict)  # codes that are keys of the dict become full country names
cities = cities.set_index('name')
city_to_country_dict = cities['countrycode'].to_dict()  # city name -> country name
df['location'] = df['location'].str.split(', ').str[0]  # keep only the text before the first ", "
df['location_country'] = df['location'].replace(city_to_country_dict)  # locations that are not a key keep their own text

# fixing location_country column and creating a list of locations that could not be resolved
# A location that was not a key of city_to_country_dict kept its own text, so
# "country == location" marks the rows that still need a lookup.
indexes_missing = list(df[df['location_country'] == df['location']].index)
unique_countries_missing = df['location_country'][indexes_missing].value_counts().index.tolist()
dictvalues = []
conflicting_cities = []
# Search each unresolved name once:
#   several hits -> ambiguous, keep the name and record it in conflicting_cities
#   one hit      -> use the matched city's primary name
#   no hit       -> keep the name unchanged
for place in unique_countries_missing:
    matches = gc.search_cities(place)
    if len(matches) > 1:
        conflicting_cities.append(place)
        dictvalues.append(place)
    elif matches:
        dictvalues.append(matches[0]['name'])
    else:
        dictvalues.append(place)

# Despite its name, this maps the location text as found to the primary CITY name (not to a country).
missing_city_to_country_dict = dict(zip(unique_countries_missing, dictvalues))
df['location'] = df['location'].replace(missing_city_to_country_dict)
df['location_country'] = df['location'].replace(city_to_country_dict)
# Blank every value that is not a known country name (isin() never yields NaN, so no dropna is needed).
df.loc[~df['location_country'].isin(countries.name), 'location_country'] = None
df.loc[df['location_us_state'].notna(), 'location_country'] = 'United States' ### If the state is known, the country is the US

#manual country imputation
# Locations the automatic lookup could not resolve, grouped by country. Where a
# two-column target is given, the location itself is also replaced by the city.
# (A missing comma used to fuse 'Naugaon' and 'Kanpur Nagar' into the single
# string 'NaugaonKanpur Nagar', so neither place was ever matched.)
df.loc[df['location'].isin(['Vāranāsi', 'Ma.Kunnathur', 'North Goa', 'Sri Potti Sriramulu Nellore', 'Bangalore Rural', 'North Twenty Four Parganas', 'Bhetasi Ba Bhag', 'Naugaon', 'Kanpur Nagar', 'Āmer', 'South Tripura', 'Patan-Veraval', 'Abujhati', 'Grant No 11', 'Kanpur Dehat', 'Sant Kabir Nagar', 'Paschim Medinipur', 'Nalbāri', 'Lakshadweep', 'Madhāpur', 'Jangareddigudem', 'Andheri East']), 'location_country'] = 'India'
df.loc[df['location'].isin(['Ang Mo Kio New Town', 'Bedok New Town', 'Jurong West New Town', 'Bukit Merah Estate', 'Novena', 'Yishun New Town', 'Tampines New Town', 'Marina South', 'Bukit Batok New Town']), ['location', 'location_country']] = ['Singapore', 'Singapore'] ### Areas in Singapore
df.loc[df['location'].isin(['Greenhithe', 'Twycross', 'Bromley', 'Wiggenhall Saint Germans', 'Sydenham', 'Newcastle Upon Tyne', 'Hendon', 'London Colney']), 'location_country'] = 'United Kingdom'
df.loc[df['location'].isin(['Al Ḩajarayn', 'Al Mariah United Group', 'Global Village']), ['location', 'location_country']] = ['Dubai', 'United Arab Emirates']
df.loc[df['location'].isin(['Kwun Tong', 'Mui Wo Kau Tsuen', 'Causeway Bay']), ['location', 'location_country']] = ['Hong Kong', 'Hong Kong']
df.loc[df['location'].isin(['Lachine', 'Chinook Cove', 'Etobicoke', 'Whistler', 'Brisco', 'Bowen Island']), 'location_country'] = 'Canada'
df.loc[df['location'].isin(['Torono', 'North Toronto','Midtown Toronto']),['location', 'location_country']] = ['Toronto', 'Canada']
# NOTE(review): 'Ra`ananna' is rewritten to Yoqne‘Am ‘Illit here, while 'Ra`Ananna' below only gets the country — confirm this is intended.
df.loc[df['location'].isin(['Yoqne‘Am ‘Illit','Yoqne`Am', 'Yoqne`am', 'Ra`ananna']), ['location','location_country']] = ['Yoqne‘Am ‘Illit', 'Israel']
df.loc[df['location'].isin(['North Ryde', 'Chadstone', 'Chermside', 'Youngs Crossing']), 'location_country'] = 'Australia'
df.loc[df['location'].isin(['Frankfurt Am Main', 'Grünheide (Mark)', 'Lustadt']), 'location_country'] = 'Germany'
df.loc[df['location'].isin(['Portlaoise', 'Monkstown', 'Midleton', 'Dunboyne']), 'location_country'] = 'Ireland'
df.loc[df['location'].isin(['Pudong', 'Shanhai']), ['location', 'location_country']] = ['Shanghai', 'China'] ### Area in Shanghai and misspelling.
df.loc[df['location'].isin(['Vedbæk', 'Kongens Lyngby', 'Lyngby', 'Gårde']), 'location_country'] = 'Denmark'
df.loc[df['location'].isin(['Ciudad De Mexico', 'Oaxaca De Juárez']), 'location_country'] = 'Mexico'
df.loc[df['location'].isin(['Herzliyya B', 'Ra`Ananna']), 'location_country'] = 'Israel'

# One-off corrections for single location strings. Depending on the target
# columns, a line rewrites the location, the US state and/or the country.

# Misspelled or abbreviated US cities: fix the name and set state + country.
df.loc[df['location'] == 'Newyork',      ['location', 'location_us_state', 'location_country']] = ['New York City', 'NY', 'United States'] 
df.loc[df['location'] == 'SanFrancisco', ['location', 'location_us_state', 'location_country']] = ['San Francisco', 'CA', 'United States']
df.loc[df['location'] == 'Laf',          ['location', 'location_us_state', 'location_country']] = ['Lafayette', 'LA', 'United States']

# Company-specific overrides: only rows of the named company are touched.
df.loc[(df['location'] == 'Pyrmont') & (df['company'] == 'Google'), ['location', 'location_country']] = ['Sydney', 'Australia'] ### Area in Sydney, where Google has a campus.
df.loc[(df['location'] == 'Hillsdale') & (df['company'] == 'Apple'), ['location_us_state', 'location_country']] = ['NJ', 'United States'] ### The other person who worked at Hillsdale was from NJ, USA.

df.loc[df['location'] == 'Sparrows Point',           ['location_us_state', 'location_country']] = ['MD', 'United States']

# Districts, misspellings and alternative spellings: replace the location by the city and set the country.
df.loc[df['location'] == 'Ḩawwārah',                 ['location', 'location_country']] = ['Amman', 'Jordan'] ### Area in Amman
df.loc[df['location'] == 'Surat City',               ['location', 'location_country']] = ['Surat', 'India']
df.loc[df['location'] == "St. John'S",               ['location', 'location_country']] = ["St. John's", 'Canada']
df.loc[df['location'] == "Montréal-Ouest",           ['location', 'location_country']] = ["Montréal", 'Canada']
df.loc[df['location'] == "Manil",                    ['location', 'location_country']] = ["Manila", 'Philippines']
df.loc[df['location'] == "Yaba",                     ['location', 'location_country']] = ["Lagos", 'Nigeria']
df.loc[df['location'] == 'Newcastle',                ['location', 'location_country']] = ['Newcastle Upon Tyne', 'United Kingdom']
df.loc[df['location'] == 'Manchester City Centre',   ['location', 'location_country']] = ['Manchester', 'United Kingdom']
df.loc[df['location'] == 'Covent Garden',            ['location', 'location_country']] = ['London', 'United Kingdom'] ### Area in London
df.loc[df['location'] == 'Kuala Lumpur City Centre', ['location', 'location_country']] = ['Kuala Lumpur', 'Malaysia']
df.loc[df['location'] == 'Knocknaheeny',             ['location', 'location_country']] = ['Cork', 'Ireland']
df.loc[df['location'] == 'Pradera Chica',            ['location', 'location_country']] = ['Quito', 'Ecuador']

# Country only: the location text is kept as it is.
df.loc[df['location'] == 'Reggio Di Calabria',  'location_country'] = 'Italy'
df.loc[df['location'] == 'Issy-Les-Moulineaux', 'location_country'] = 'France'
df.loc[df['location'] == 'Schiphol',            'location_country'] = 'Netherlands'
df.loc[df['location'] == 'Watreso',             'location_country'] = 'Ghana'
df.loc[df['location'] == 'Bangloma',            'location_country'] = 'Democratic Republic of the Congo'
df.loc[df['location'] == "Aţ Ţā'If",            'location_country'] = 'Saudi Arabia'
df.loc[df['location'] == "Qūnah",               'location_country'] = 'Egypt'
df.loc[df['location'] == "Rajbari",             'location_country'] = 'Bangladesh'
df.loc[df['location'] == 'Haidian',             'location_country'] = 'China'
df.loc[df['location'] == 'Jāwā',                'location_country'] = 'Jordan'
df.loc[df['location'] == 'Polska',              'location_country'] = 'Poland'
df.loc[df['location'] == 'Jabinyānah',          'location_country'] = 'Tunisia'
# NOTE(review): the other country values are written without an article — confirm 'The Dominican Republic' matches the naming used in `countries.name`.
df.loc[df['location'] == 'La Jina',             'location_country'] = 'The Dominican Republic'
df.loc[df['location'] == 'Arifiye',             'location_country'] = 'Turkey'
df.loc[df['location'] == 'Rostov-Na-Donu',      'location_country'] = 'Russia'
df.loc[df['location'] == 'Gujrāt',              'location_country'] = 'Pakistan'

#manual conflicting_cities country imputation
# Names for which the city search returned more than one candidate, resolved by hand.
df.loc[df['location'].isin(['Bān','Ban','Benga', 'ben']), ['location', 'location_country']] = ['Bengaluru', 'India'] ### Ban is another way to spell Bengaluru
df.loc[df['location'] == 'Quebec', 'location_country'] = 'Canada' 
df.loc[df['location'] == 'Belgrad', 'location_country'] = 'Serbia' ### Belgrad is another way to spell Belgrade
df.loc[(df['location'] == 'Charlestown') & (df['company'] == 'Apple'), 'location_country'] = 'Australia' ### Apple has a location in Australia called Charlestown, which has an Apple Store
df.loc[df['location'] == 'Mirpur', 'location_country'] = 'Pakistan'

#manual us states imputation
# San Jose defaults to California; rows whose review text mentions Costa Rica are moved to Costa Rica right after.
df.loc[df['location'] == 'San Jose', ['location_us_state', 'location_country']] = ['CA', 'United States']
# Matched case-insensitively and on word starts: a word beginning with "costa",
# the whole words rica/rican/ricans, or "asoamazon" anywhere. The previous
# lowercase, unanchored pattern missed "Costa Rica" but matched the "rica"
# inside "America".
costarica = r'\bcosta|\brica(?:ns?)?\b|asoamazon'
mentions_costa_rica = (
    df['pros'].str.contains(costarica, case=False, na=False)
    | df['cons'].str.contains(costarica, case=False, na=False)
    | df['review_title'].str.contains(costarica, case=False, na=False)  # na=False: missing text counts as "no mention"
)
df.loc[(df['location'] == 'San Jose') & mentions_costa_rica, ['location_us_state', 'location_country']] = [None, 'Costa Rica']

df.loc[(df['location']=='Durham') & (df['location_country'] == 'United States'), 'location_us_state'] = 'NC'
df.loc[(df['location']=='San Francisco') & (df['location_us_state'].isna()), 'location_us_state'] = 'CA' ### There is one that is from San Francisco, MO, but there are over 1k from CA, so I'll just assume that they are from CA. Otherwise they would probably have specified the state.
df.loc[(df['location']=='Limerick') & (df['location_us_state'].isna()), 'location_country'] = 'Ireland' ### Missings are from Apple and Google and they have offices in Limerick, Ireland, also the most populated.
df.loc[(df['location']=='Waterford') & (df['location_us_state'].isna() & (df['company'] == 'Apple')), 'location_country'] = 'Ireland' ### Missing are all from Apple and they have offices in Waterford, Ireland, also the most populated.
df.loc[(df['location']=='Stratford') & (df['company'] == 'Apple'), 'location_country'] = 'United Kingdom' ### Missing are all from Apple and they have offices in Stratford.
df.loc[(df['location']=='Los Angeles') & (df['location_us_state'].isna()), 'location_us_state'] = 'CA' ### Los Angeles rows without a state are assumed to be Los Angeles, CA.
df.loc[(df['location']=='Los Angeles') & (df['company'] == 'Amazon'), 'location_us_state'] = 'CA' ### Every Amazon review from Los Angeles is set to CA, even when another state was parsed.
df.loc[(df['location']=='Eastvale') & (df['company'] == 'Amazon'), 'location_us_state'] = 'CA' ### Missing are all from Amazon and they have offices in Eastvale, CA.
df.loc[(df['location']=='Spokane Valley'), 'location_us_state'] = 'WA' ### Only Spokane Valley is in WA.
df.loc[(df['location']=='Mountain View') & (df['location_us_state'].isna()), 'location_us_state'] = 'CA' ### 1.2k in total, 99% from CA.
df.loc[(df['location']=='Enfield') & (df['location_us_state'].isna()), 'location_country'] = 'United Kingdom' ### London borough
df.loc[(df['location']=='University Park') & (df['company'] == 'Amazon') & (df['location_us_state'].isna()), 'location_us_state'] = 'IL' ### Amazon has offices in University Park, IL.
df.loc[(df['location']=='Saugus') & (df['company'] == 'Apple') & (df['location_us_state'].isna()), 'location_us_state'] = 'MA' ### Apple has offices close to Saugus, MA.
df.loc[(df['location']=='Homewood') & (df['company'] == 'Google') & (df['location_us_state'].isna()), 'location_us_state'] = 'IL' ### Homewood is a Chicago, IL suburb, where Google has offices.
df.loc[(df['location']=='Aurora') & (df['company'] == 'Google') & (df['location_us_state'].isna()), 'location_us_state'] = 'IL' ### Aurora is a Chicago, IL suburb, where Google has offices.
df.loc[(df['location']=='Salinas') & (df['location_us_state'].isna()), 'location_us_state'] = 'CA' ### Salinas is close to Silicon Valley, CA.
df.loc[(df['location']=='Chicago') & (df['location_us_state'].isna()), 'location_us_state'] = 'IL'
df.loc[(df['location']=='Goodyear') & (df['location_us_state'].isna()), 'location_us_state'] = 'AZ' ### Goodyear is close to Phoenix, AZ where there are multiple FAANG offices.
df.loc[(df['location']=='Worcester') & (df['location_us_state'].isna()), 'location_us_state'] = 'MA' ### All Worcester review in the dataset are from Massachussets.
df.loc[(df['location']=='Jurupa Valley') & (df['location_us_state'].isna()), 'location_us_state'] = 'CA' ### All Jurupa Valley review in the dataset are from CA.
df.loc[(df['location']=='Burton') & (df['location_us_state'].isna()), 'location_us_state'] = 'MI' ### Highest populated "Burton" in USA (Burton, MI). And nearby cities work for Microsoft, since they're close to Detroit, MI. (Was 'IL', contradicting this note.)

df.loc[(df['location_us_state'].isna()) & (df['location_country']=='United States'), 'location_country'] = None ### If we can't specify a state, we can't be sure what country they are from.

# Sentiment analysis

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from scipy import special
from tqdm.auto import tqdm

In [None]:
# Checkpoint used for sentiment scoring. Tokenizer and model are both loaded from
# the same name so that the vocabulary matches the weights.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
def truncate(pros, cons, max_tokens=512):
    """Joins pros and cons, dropping trailing sentences until the text fits.

    While the joined text has more than ``max_tokens`` tokens (counted with
    the notebook-level ``tokenizer``), the last ". "-separated sentence is
    removed from whichever side currently has more tokens (``cons`` on a
    tie). Exactly one side is shortened per step and the total is re-checked
    before the next cut, so no sentence is dropped once the text fits.

    Args:
        pros: Text of the "pros" field; a missing value counts as "".
        cons: Text of the "cons" field; a missing value counts as "".
        max_tokens: Largest accepted token count of the joined text.

    Returns:
        ``pros + " " + cons`` after truncation.
    """
    def as_text(value):
        # Missing values (None / NaN) would otherwise break the concatenation.
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    def n_tokens(text):
        return len(tokenizer.tokenize(text))

    pros, cons = as_text(pros), as_text(cons)
    while (pros or cons) and n_tokens(pros + " " + cons) > max_tokens:
        # rpartition keeps everything before the last ". "; when there is no
        # ". " at all it yields "", i.e. the whole remaining text is dropped.
        if n_tokens(pros) > n_tokens(cons) or not cons:
            pros = pros.rpartition(". ")[0]
        else:
            cons = cons.rpartition(". ")[0]
    return pros + " " + cons


def truncate_single(sentence):
    """Shortens a text from the end until it is below the token limit.

    A text of at most 512 tokens (counted with the notebook-level
    ``tokenizer``) is returned untouched. A longer text loses its last
    ". "-separated sentence repeatedly until fewer than 512 tokens remain.
    """
    def token_count(text):
        return len(tokenizer.tokenize(str(text)))

    if token_count(sentence) > 512:
        while token_count(sentence) >= 512:
            # Everything before the last ". " ("" when there is none).
            sentence = sentence.rpartition(". ")[0]
    return sentence

In [None]:
# Step 1: pros + cons, trimmed to the token budget.
df["pros_and_cons"] = [
    truncate(pros, cons) for pros, cons in zip(df["pros"], df["cons"])
]
# Missing titles / advice become empty strings so they can be concatenated.
df["advice"] = df["advice"].fillna("")
df["review_title"] = df["review_title"].fillna("")
# Step 2: "<title>. <pros and cons> <advice>", trimmed once more as a whole.
df["pros_and_cons"] = [
    truncate_single(title + ". " + body + " " + advice)
    for title, body, advice in zip(df["review_title"], df["pros_and_cons"], df["advice"])
]

df["pros_and_cons"] = df['pros_and_cons'].astype('str')

In [None]:
# Score every review text with the sentiment model.
SENTIMENT_BATCH_SIZE = 32   # texts per forward pass; lower this if memory is tight
SENTIMENT_MAX_TOKENS = 512  # hard input limit passed to the tokenizer, special tokens included

sentimentRating = []  # one rating (1..3) per row of df, in df's current row order
texts = df["pros_and_cons"].tolist()
for i in tqdm(range(0, len(texts), SENTIMENT_BATCH_SIZE)):
    batch = tokenizer(
        texts[i : i + SENTIMENT_BATCH_SIZE],
        return_tensors="pt",
        padding="longest",
        # The sentence-level trimming above counts tokens without the special
        # tokens the tokenizer adds, so enforce the limit here as well.
        truncation=True,
        max_length=SENTIMENT_MAX_TOKENS,
    )
    with torch.no_grad():
        outputs = model(**batch)
    # Highest-scoring class index (0-based) + 1, stored as plain Python ints.
    sentimentRating.extend((outputs.logits.argmax(dim=1) + 1).tolist())

In [None]:
# Attach the predictions BEFORE re-ordering the rows: sentimentRating was filled
# in df's current row order and is inserted by position, so sorting first would
# pair every rating with the wrong review.
assert len(sentimentRating) == len(df), "expected exactly one sentiment rating per review"
sentiment_labels = {1: 'Negative', 2: 'Neutral', 3: 'Positive'}
df.insert(
    loc=1,
    column="sentiment",
    value=[sentiment_labels[int(score)] for score in sentimentRating],  # int() also accepts numpy / torch scalars
)
df['sentiment'] = df['sentiment'].astype('category')

# Newest reviews first, with a fresh 0..n-1 index.
df = df.sort_values(by=['date'], ascending=False)
df = df.reset_index(drop=True)

# Drop the helper text column, use the final column names and fix the column order.
df = df.drop(columns=["pros_and_cons"])
df = df.rename(columns={'helpful': 'helpful_count', 'location_country' : 'country', 'location_us_state' : 'us_state', 'location' : 'city'})
df = df[['date', 'review_title', 'pros', 'cons', 'advice', 'rating', 'sentiment', 'job_title', 'job_rank', 'company', 'status' ,'experience', 'helpful_count', 'category', 'city', 'us_state', 'country']]

df.to_csv('data.csv', index=False)