In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv('job_postings.csv', sep=',')
df = df[df['views'] > 50]
#df = df.drop(columns=['skills_desc','scraped','min_salary', 'med_salary','max_salary','job_id', 'company_id', 'pay_period', 'formatted_work_type', 'location', 'original_listed_time', 'remote_allowed', 'job_posting_url', 'application_url', 'application_type', 'expiry', 'closed_time', 'formatted_experience_level', 'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type'])
#df = df.dropna()
#drop columns with more than 50% missing values
df = df.dropna(thresh=0.5*len(df), axis=1)
# drop rows with missing values
df = df.dropna()

print(df.shape)
df.head()

(4211, 17)


Unnamed: 0,job_id,company_id,title,description,formatted_work_type,location,applies,original_listed_time,views,job_posting_url,application_type,expiry,formatted_experience_level,listed_time,sponsored,work_type,scraped
9,3757936026,634806.0,Instrumentation Quality Control Representative...,Instrumentation Quality Control Representative...,Contract,United States,12.0,1699080000000.0,59.0,https://www.linkedin.com/jobs/view/3757936026/...,SimpleOnsiteApply,1704270000000.0,Entry level,1699130000000.0,0,CONTRACT,1699137876
15,3757935012,18583501.0,Quantitative Trader [5048],"Quantitative traders research, develop, and re...",Full-time,"New York, United States",36.0,1699080000000.0,158.0,https://www.linkedin.com/jobs/view/3757935012/...,ComplexOnsiteApply,1701670000000.0,Associate,1699080000000.0,0,FULL_TIME,1699135011
47,3757932930,2073870.0,"Director, Contributor Experience",About Appen\n\nAppen is a leader in AI enablem...,Full-time,"Texas, United States",3.0,1696920000000.0,64.0,https://www.linkedin.com/jobs/view/3757932930/...,OffsiteApply,1701680000000.0,Director,1699080000000.0,0,FULL_TIME,1699134485
122,3757928945,91313799.0,Software Quality Assurance Engineer [5291],Our mission is to enable a cure for the millio...,Full-time,"Burlington, MA",82.0,1699080000000.0,189.0,https://www.linkedin.com/jobs/view/3757928945/...,ComplexOnsiteApply,1701670000000.0,Entry level,1699080000000.0,0,FULL_TIME,1699132379
155,3757922078,7170.0,"Specialist, Accounts Receivable","Job Description\n\nThe Specialist, AR position...",Full-time,United States,32.0,1699080000000.0,83.0,https://www.linkedin.com/jobs/view/3757922078/...,OffsiteApply,1701670000000.0,Entry level,1699080000000.0,0,FULL_TIME,1699084381


In [23]:
# Encode categorical features
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].astype(str)  # Ensure column is treated as string
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

# Separate the target variable ('views') from the features
X = df.drop('views', axis=1)
y = df['views']

# Apply SelectKBest with chi2 to select the features
k_best_relevant = SelectKBest(score_func=chi2, k='all')
fit_relevant = k_best_relevant.fit(X, y)

# Create a DataFrame to display feature scores
features_scores_relevant = pd.DataFrame({
    'Feature': X.columns,
    'Score': fit_relevant.scores_
}).sort_values(by='Score', ascending=False)

print(features_scores_relevant)

                       Feature         Score
15                     scraped  7.027829e+11
1                   company_id  2.792356e+10
10                      expiry  4.720428e+09
7         original_listed_time  3.444984e+09
12                 listed_time  3.351462e+09
0                       job_id  1.378258e+08
8              job_posting_url  4.777918e+05
6                      applies  4.721927e+05
3                  description  4.390861e+05
2                        title  3.465977e+05
5                     location  9.489466e+04
11  formatted_experience_level  7.985491e+02
13                   sponsored  4.030190e+02
9             application_type  3.707200e+02
4          formatted_work_type  2.295796e+02
14                   work_type  2.295796e+02


In [24]:
# analyze the description for more views and aplications
# Create a TfidfVectorizer
vectorizer = TfidfVectorizer()

# Apply the vectorizer
X = vectorizer.fit_transform(df['description'].astype(str))
y = df['views']

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a LinearRegression model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train)

# Predict the target variable
y_pred = model.predict(X_test)

# Print the R2 score
print(r2_score(y_test, y_pred))

# Print the mean squared error
print(mean_squared_error(y_test, y_pred))

# Create a DataFrame with the features and coefficients
df = pd.DataFrame({
    'feature': vectorizer.get_feature_names(),
    'coef': model.coef_
})

# Sort the DataFrame by the coefficient
df = df.sort_values('coef', ascending=False)

# Display the 10 most positive words
print(df.head(10))

# Display the 10 most negative words
print(df.tail(10))



-0.07842731959569749
51615.36018540888
['10' '100' '1000' ... '997' '998' '999']
     feature         coef
2364    3129  3534.333333
632     1569  3330.333333
1883    2696  2355.333333
2309     308  2141.333333
3655     665  2008.333333
2466    3220  1762.333333
2698     343  1712.333333
3564     582  1588.333333
1524    2372  1432.333333
2187     297  1427.333333
     feature       coef
872     1785 -71.666667
180     1161 -71.666667
2749    3476 -71.666667
331     1298 -71.666667
3535     556 -71.666667
1301    2171 -71.666667
3672     680 -71.666667
3674     682 -71.666667
2227    3005 -71.666667
503     1452 -71.666667
