In [4]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from collections import Counter
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Assuming you have a CSV file
df = pd.read_csv('/kaggle/input/job-postings/job_postings.csv')


In [3]:
# Checking for null values and data types
print(df.info())
print(df.describe(include='all'))
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        2499 non-null   object
 1   Company      2313 non-null   object
 2   Location     2493 non-null   object
 3   Description  2499 non-null   object
 4   Skills       2499 non-null   object
 5   Link         2500 non-null   object
dtypes: object(6)
memory usage: 117.3+ KB
None
             Title           Company    Location  \
count         2499              2313        2493   
unique        1799              1057         143   
top     Accountant  Al Futtaim Group  Dubai, UAE   
freq            36                86         842   

                                              Description  \
count                                                2499   
unique                                               2406   
top     Live the cabin crew life  Join our team as Emi...   
freq          

In [5]:
title_counts = df['Title'].value_counts().nlargest(10).reset_index()
title_counts.columns = ['Job Title', 'Frequency']

fig = px.bar(title_counts, x='Job Title', y='Frequency', title='Top 10 Job Titles')
fig.update_layout(xaxis_title='Job Title', yaxis_title='Frequency', xaxis_tickangle=-45)
fig.show()


In [6]:
company_counts = df['Company'].value_counts().nlargest(10).reset_index()
company_counts.columns = ['Company', 'Number of Postings']

fig = px.bar(company_counts, x='Company', y='Number of Postings', title='Top 10 Companies with Most Job Postings')
fig.update_layout(xaxis_title='Company', yaxis_title='Number of Postings', xaxis_tickangle=-45)
fig.show()


In [7]:
location_counts = df['Location'].value_counts().nlargest(10).reset_index()
location_counts.columns = ['Location', 'Frequency']

fig = px.bar(location_counts, x='Location', y='Frequency', title='Top 10 Job Locations')
fig.update_layout(xaxis_title='Location', yaxis_title='Frequency', xaxis_tickangle=-45)
fig.show()


In [8]:
skills = df['Skills'].dropna().str.split(',').sum()
skills_counter = Counter([skill.strip() for skill in skills])
skills_df = pd.DataFrame(skills_counter.most_common(10), columns=['Skill', 'Frequency'])

fig = px.bar(skills_df, x='Skill', y='Frequency', title='Top 10 Required Skills')
fig.update_layout(xaxis_title='Skill', yaxis_title='Frequency', xaxis_tickangle=-45)
fig.show()


In [9]:
# Creating a list of words from descriptions excluding stop words
descriptions = ' '.join(df['Description'].fillna(''))
vectorizer = CountVectorizer(stop_words='english')
word_counts = vectorizer.fit_transform([descriptions])
word_freq = dict(zip(vectorizer.get_feature_names_out(), word_counts.toarray()[0]))

In [12]:
# Convert the dictionary to a DataFrame for Plotly
top_words_df = pd.DataFrame(Counter(word_freq).most_common(10), columns=['Word', 'Frequency'])

fig = px.bar(top_words_df, x='Word', y='Frequency', title='Top 10 Most Common Words in Job Descriptions')
fig.update_layout(xaxis_title='Word', yaxis_title='Frequency', xaxis_tickangle=-45)
fig.show()

In [15]:
# Adding a new column for word count in each job description
df['Word_Count'] = df['Description'].apply(lambda x: len(str(x).split()))

# Finding the maximum, minimum, and average word counts
max_words = df['Word_Count'].max()
min_words = df['Word_Count'].min()
avg_words = df['Word_Count'].mean()

print(f"Description with the most words: {max_words}")
print(f"Description with the least words: {min_words}")
print(f"Average word count: {avg_words:.2f}")

Description with the most words: 1182
Description with the least words: 1
Average word count: 164.34


In [19]:
# Adding a new column for word count in each job description
df['Word_Count_skills'] = df['Skills'].apply(lambda x: len(str(x).split()))

# Finding the maximum, minimum, and average word counts
max_words = df['Word_Count_skills'].max()
min_words = df['Word_Count_skills'].min()
avg_words = df['Word_Count_skills'].mean()

print(f"Description with the most words: {max_words}")
print(f"Description with the least words: {min_words}")
print(f"Average word count: {avg_words:.2f}")

Description with the most words: 502
Description with the least words: 1
Average word count: 66.15
