In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the scraped Glassdoor company table; the file is Windows-1252 encoded.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/glassdoor-company-insightsscraped-data-collection/glassdoor_comany.csv",
    encoding='cp1252',
)

In [None]:
# Dimensions of the loaded table as (n_rows, n_columns).
df.shape

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Replace missing values with the "Unknown" placeholder in non-numeric columns
# only. Numeric columns (e.g. 'Company rating', which is plotted as a number
# later on) keep their NaNs so a string placeholder never turns them into
# object dtype.
text_columns = df.select_dtypes(exclude="number").columns
df[text_columns] = df[text_columns].fillna("Unknown")

In [None]:
# Column dtypes and non-null counts after the missing-value fill.
df.info()

In [None]:
# Preview the first rows of the table.
df.head()

In [None]:
# Print how many distinct values each column holds.
for column_name in df.columns:
    distinct_count = df[column_name].nunique()
    print(column_name, " ", distinct_count)

# Preprocessing Location feature

We extract the number of offices from string values such as "n office locations in United States".

Any other value (for example, the address of a single office) is mapped to 1, meaning the company has at least one office in the United States.

In [None]:
def location_process(x):
    """Return the number of offices described by a ``Location`` value.

    Values of the form "<n> office locations in ..." yield ``n``; thousands
    separators in the count (e.g. "1,200") are accepted. Every other value
    (a single address, a placeholder, a non-string such as NaN, or a count
    that cannot be parsed) yields 1.

    Parameters
    ----------
    x : object
        Raw cell value from the 'Location' column.

    Returns
    -------
    int
        Parsed office count, or 1 when no count is present.
    """
    if isinstance(x, str) and "office locations" in x:
        leading_token = x.split()[0].replace(",", "")
        try:
            return int(leading_token)
        except ValueError:
            # The phrase is present but not preceded by a number; fall through
            # to the same default used for single-address values.
            pass
    return 1

In [None]:
# Derive the per-company office count from the raw 'Location' text.
df['offices'] = df['Location'].map(location_process)

In [None]:
def _count_in_thousands(value):
    """Convert a count string to a float expressed in thousands.

    "1.5K" -> 1.5 and "500" -> 0.5, so every result shares the same unit.
    Surrounding whitespace and a lowercase "k" suffix are accepted. Values
    that cannot be parsed as a number (for example the "Unknown" placeholder)
    become NaN instead of raising.
    """
    text = str(value).strip()
    try:
        if text.upper().endswith("K"):
            # Already expressed in thousands: just drop the suffix.
            return float(text[:-1])
        # Plain count: rescale so it is comparable with the "K" values.
        return float(text) / 1000
    except ValueError:
        return np.nan


df['salaries_in_K'] = df['Company salaries'].map(_count_in_thousands)
df['reviews_in_K'] = df['Company reviews'].map(_count_in_thousands)
df['jobs_in_K'] = df['Company Jobs'].map(_count_in_thousands)

# Pairplot to take a closer look at how data is distributed

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the numeric features.
sns.pairplot(
    data=df,
    vars=['salaries_in_K', 'reviews_in_K', 'jobs_in_K', 'Company rating'],
)

# Histograms of each numeric feature: overall distribution and distribution split by number-of-employees category

In [None]:
def hists(x):
    """Show two side-by-side histograms of column ``x`` of ``df``.

    The left panel is the overall distribution drawn in red; the right panel
    is the same distribution coloured by 'Number of Employees'. Both panels
    include a KDE curve.
    """
    _, (overall_ax, grouped_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
    shared_kwargs = dict(data=df, x=x, kde=True)
    sns.histplot(ax=overall_ax, color='r', **shared_kwargs)
    sns.histplot(ax=grouped_ax, hue='Number of Employees', **shared_kwargs)
    plt.show()

In [None]:
# Draw the histogram pair for every numeric feature.
for feature in ('salaries_in_K', 'reviews_in_K', 'jobs_in_K', 'Company rating'):
    hists(feature)

# Preparing data for recommendation system

In [None]:
# Build one lower-cased text blob per company from its descriptive columns;
# this is the document that gets vectorized for the recommender.
df['overview'] = (
    df['Company Description']
    + " " + df['Industry Type']
    + " " + df['Number of Employees']
    + " " + df['Location']
).str.lower()

# Vectorization and fitting the data

In [None]:
# Bag-of-words counts over the overview text: English stop words removed,
# vocabulary capped at the 5000 most frequent terms, returned as a dense array.
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = cv.fit_transform(raw_documents=df['overview']).toarray()

In [None]:
# Pairwise cosine similarity between every pair of company vectors;
# row i holds the similarity of company i to all companies.
similarity = cosine_similarity(X=vectors)

# Recommendation function

In [None]:
def similar_company(name, top_n=7):
    """Print the companies whose overview text is most similar to ``name``.

    Finds the first row of ``df`` whose 'Company Name' equals ``name``, reads
    that row of the precomputed ``similarity`` matrix and prints the names of
    the ``top_n`` highest-scoring *other* companies, best match first.

    Parameters
    ----------
    name : str
        Exact value to look up in ``df['Company Name']``.
    top_n : int, default 7
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If ``name`` does not occur in ``df['Company Name']``.
    """
    hits = np.flatnonzero((df['Company Name'] == name).to_numpy())
    if hits.size == 0:
        raise IndexError(f"company {name!r} not found in df['Company Name']")

    # Use row positions throughout: the similarity matrix is positional, so
    # mixing it with index labels would break on a non-default index.
    query_pos = int(hits[0])
    scores = np.asarray(similarity[query_pos])

    # Stable descending sort keeps ties in row order. The queried row is
    # removed explicitly rather than assumed to be ranked first, which is not
    # guaranteed when another company has an identical overview.
    ranked = np.argsort(-scores, kind="stable")
    ranked = ranked[ranked != query_pos][:top_n]

    for pos in ranked:
        print(df['Company Name'].iloc[pos])

In [None]:
# Example query: companies most similar to Amazon.
similar_company(name='Amazon')