# Importing toolkit

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px 
from plotly.offline import iplot
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")
pd.set_option('future.no_silent_downcasting', True)
pd.options.mode.copy_on_write = "warn"

## Custom Visualization Functions

In [None]:
# Adding Line To Plotly Figure
def add_line(x0=0, y0=0, x1=0, y1=0,
             line_color="#00DFA2", font_color="#3C486B",
             xposition="right", text="Text", figure=None):
    """Draw a labelled dash-dot line on a Plotly figure.

    Parameters
    ----------
    x0, y0 : number
        Start point of the line.
    x1, y1 : number
        End point of the line. Note that the line is actually drawn up to
        ``y1 + 2``; callers that need an exact height must account for it.
    line_color : str
        Colour of the line.
    font_color : str
        Colour of the label text.
    xposition : str
        Horizontal anchor of the label (passed through as ``xanchor``).
    text : str
        Label prefix; the label reads ``"<text>: <x1 with one decimal>"``.
    figure : object, optional
        Figure to draw on (anything exposing ``add_shape``). When omitted the
        notebook-level global ``fig`` is used, which keeps existing calls
        working unchanged.
    """
    # Resolve the target lazily so the global is only required when no
    # explicit figure is supplied.
    target = fig if figure is None else figure
    target.add_shape(
        type='line',
        x0=x0,
        y0=y0,
        x1=x1,
        y1=y1 + 2,
        line={
            "color": line_color,
            "width": 3,
            "dash": "dashdot",
        },
        label={
            # Tabs are expanded to pad the label on both sides.
            "text": f"\t{text}: {x1: 0.1f}\t".expandtabs(5),
            "textposition": "end",
            "yanchor": "top",
            "xanchor": xposition,
            "textangle": 0,
            "font": {
                "size": 14,
                "color": font_color,
                "family": "arial",
            },
        },
    )

In [None]:
def custome_layout(title_size=28, hover_font_size=16, showlegend=False, figure=None):
    """Apply the notebook's shared title, legend and hover styling to a figure.

    Parameters
    ----------
    title_size : int
        Font size of the figure title (font family is always "tahoma").
    hover_font_size : int
        Font size of the hover label (dark background, "arial" font).
    showlegend : bool
        Whether the legend is shown.
    figure : object, optional
        Figure to style (anything exposing ``update_layout``). When omitted
        the notebook-level global ``fig`` is used, so existing calls keep
        working unchanged.
    """
    # Resolve the target lazily so the global is only required when no
    # explicit figure is supplied.
    target = fig if figure is None else figure
    target.update_layout(
        showlegend=showlegend,
        title={
            "font": {
                "size": title_size,
                "family": "tahoma",
            }
        },
        hoverlabel={
            "bgcolor": "#111",
            "font_size": hover_font_size,
            "font_family": "arial",
        },
    )

# Loading data

In [None]:
# Load the dataset from ML1.csv (path is relative to the working directory).
df = pd.read_csv("ML1.csv")

# Overview

In [None]:
# Print a summary of the columns: dtypes and non-null counts.
df.info()

In [None]:
# Show 10 randomly chosen rows; the fixed seed keeps the sample reproducible.
df.sample(10, random_state=15)

In [None]:
# Descriptive statistics, transposed so each described column becomes a row.
df.describe().T

# Data preprocessing

## Checking for nan values

In [None]:
# Count the missing (NaN) values in each column
df.isna().sum()

In [None]:
# Show the rows whose Age value is missing
df[df["Age"].isna()]

<p style = "color: gold;
            font: bold 22px arial;
            padding: 15px;
            background-color: #112">
    • Removing these two records..
</p>

In [None]:
# Drop, in place, every row that contains at least one NaN value
# (dropna with its default arguments removes rows with ANY missing value, not only all-NaN rows)
df.dropna(inplace=True)
# Check again that no NaN values remain
df.isna().sum()

## Checking for duplicates records

In [None]:
# Number of rows flagged as duplicates of another row
df.duplicated().sum()

In [None]:
# Preview up to 15 of the rows flagged as duplicates
df[df.duplicated()].head(15)

<p style = "color: gold;
            font: bold 22px arial;
            padding: 15px;
            background-color: #112">
    • Removing the duplicate rows
</p>

In [None]:
# Drop duplicate records in place and renumber the index (0..n-1) in the same call
df.drop_duplicates(inplace=True, ignore_index=True)

In [None]:
# Preview the first rows after cleaning
df.head()

# Column-by-Column Exploration

<p style = "color: #98EECC;
            font: bold 18px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid lightgreen;
            border-radius: 8px">
    Age Column
</p>

In [None]:
# Mean and median of Age, used later as reference lines on the Age histogram
ages = df["Age"]
mean_of_age, median_of_age = ages.mean(), ages.median()

In [None]:
# Box plot of the Age column with the shared notebook styling
fig = px.box(y=df["Age"], labels={"y": "Age"}, title="Ages Distribution", template="plotly_dark")
custome_layout()
iplot(fig)

In [None]:
# Histogram of Age (25 bins) with dash-dot reference lines at the mean and the median
fig = px.histogram(
    df["Age"], 
    nbins=25,
    title= "Age Distribution",
    template="plotly_dark",
    labels={"value" :"Age"}
)
custome_layout()
# Override the hover text and give the bars a thin outline
fig.update_traces(
    textfont = {
        "size" : 20,
        "family" :"tahoma",
        "color": "#fff"
    },
    hovertemplate = "Age: %{x}<br>Frequency: %{y}",
    marker=dict(line=dict(color='#000', width=0.1))
)
# Adding Mean Line
# NOTE: y1 is a hard-coded line height; add_line draws up to y1 + 2 on top of it
add_line(x0=mean_of_age, y0=0, x1=mean_of_age, y1=30+2, line_color="#E97777",font_color="#E97777", 
         text="Mean", xposition="left")
# Adding Median Line
add_line(x0=median_of_age, y0=0, x1=median_of_age, y1=30+2, line_color="#FFE5F1",
         font_color="#fff", xposition="right", text="Median")
iplot(fig)

<p style = "color: #98EECC;
            font: bold 18px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid lightgreen;
            border-radius: 8px">
    Gender Column
</p>

In [None]:
# Share of each gender as a percentage of all rows, then shown as formatted strings
gender = df["Gender"].value_counts(normalize=True) * 100
gender.map(lambda pct: f"{pct:0.2f}%")

In [None]:
# Bar chart of the gender percentages, one coloured bar per gender,
# with the rounded percentage written on each bar
# NOTE(review): the "index"/"y" label keys rely on how Plotly Express names
# array-like arguments - confirm the axis titles render as intended
fig = px.bar(data_frame = gender,
             x = gender.index,
             y = gender,
             color = gender.index,
             title = "Gender Frequency (PCT)",
             color_discrete_sequence=["#45FFCA", "#FF9B9B"],
             labels= {"index" :"Gender", "y": "Frequency in PCT(%)"},
             template="plotly_dark",
             text = gender.apply(lambda x: f"{x:0.0f}%"))
custome_layout()
# Style the on-bar text and replace the default hover text
fig.update_traces(
    textfont = {
        "size" : 16,
        "family" :"arial",
        "color": "#222"
    },
    hovertemplate = "Gender: %{x}<br>Percentage: %{y:0.1f}%",
)
iplot(fig)

<p style = "color: #98EECC;
            font: bold 18px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid lightgreen;
            border-radius: 8px">
    Education Level Column
</p>

In [None]:
# Share of each education level as a percentage of all rows, then shown as formatted strings
education = df["Education Level"].value_counts(normalize=True) * 100
education.map(lambda pct: f"{pct:0.2f}%")

In [None]:
# Bar chart of the education-level percentages, one coloured bar per level,
# with the rounded percentage written on each bar
# NOTE(review): the "index"/"y" label keys rely on how Plotly Express names
# array-like arguments - confirm the axis titles render as intended
fig = px.bar(data_frame = education,
             x = education.index,
             y = education,
             color = education.index,
             title = "Education Frequency (PCT)",
             color_discrete_sequence=["#45FFCA", "#D09CFA", "#FF9B9B"],
             labels= {"index" :"Education", "y": "Frequency in PCT(%)"},
             template="plotly_dark",
             text = education.apply(lambda x: f"{x:0.0f}%"))
custome_layout()
# Style the on-bar text and replace the default hover text
fig.update_traces(
    textfont = {
        "size" : 16,
        "family" :"arial",
        "color": "#222"
    },
    hovertemplate = "Education: %{x}<br>Percentage: %{y:0.1f}%",
)
iplot(fig)

<p style = "color: #98EECC;
            font: bold 18px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid lightgreen;
            border-radius: 8px">
    Experience Column
</p>

In [None]:
# Box plot of the Years of Experience column with the shared notebook styling
fig = px.box(
    y=df["Years of Experience"],
    labels={"y": "EXP Years"},
    template="plotly_dark",
    title="Experience Years Distribution",
)
custome_layout()
iplot(fig)

<p style = "color: #98EECC;
            font: bold 18px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid lightgreen;
            border-radius: 8px">
    Salary Column
</p>

In [None]:
# Salary distribution per education level (one box per level), with a smaller hover font
fig = px.box(
    x=df["Education Level"],
    y=df["Salary"],
    labels={"x": "Education Level", "y": "Salary"},
    template="plotly_dark",
    title="Salary And Education Level",
)
custome_layout(hover_font_size=13)
iplot(fig)

# Insights

In [None]:
# Mean salary per gender, sorted from highest to lowest
salary_by_gender = df.groupby("Gender")["Salary"].mean().sort_values(ascending=False)
# Display only: the same values formatted as dollar amounts (result is not stored)
salary_by_gender.apply(lambda x: f"${x:,.2f}")

In [None]:
# Bar chart of the average salary per gender.
# An explicit two-column frame is built so the axes can be labelled through
# real column names; the previous labels ("Education" / "Frequency in PCT(%)")
# had been copied from the percentage charts and did not describe this chart.
salary_gender_df = salary_by_gender.rename_axis("Gender").reset_index(name="Salary")
fig = px.bar(
    salary_gender_df,
    x="Gender",
    y="Salary",
    color="Gender",
    title="AVG Salary By Gender",
    color_discrete_sequence=["#45FFCA", "#D09CFA", "#FF9B9B"],
    labels={"Salary": "Average Salary"},
    template="plotly_dark",
    text_auto="0.4s",
)
custome_layout()
# Style the on-bar text and replace the default hover text
fig.update_traces(
    textfont={
        "size": 16,
        "family": "arial",
        "color": "#222",
    },
    hovertemplate="Gender: %{x}<br>Average Salary: $%{y:0.4s}",
)
iplot(fig)

In [None]:
# Mean salary per education level, sorted from highest to lowest
salary_by_education = df.groupby("Education Level")["Salary"].mean().sort_values(ascending=False)
# Display only: the same values formatted as dollar amounts (result is not stored)
salary_by_education.apply(lambda x: f"${x:,.2f}")

In [None]:
# Bar chart of the average salary per education level.
# An explicit two-column frame is built so the axes can be labelled through
# real column names; the previous y label ("Frequency in PCT(%)") had been
# copied from the percentage charts although this chart shows salaries.
salary_education_df = salary_by_education.rename_axis("Education").reset_index(name="Salary")
fig = px.bar(
    salary_education_df,
    x="Education",
    y="Salary",
    color="Education",
    title="AVG Salary Via Education Level",
    color_discrete_sequence=["#45FFCA", "#D09CFA", "#FF9B9B"],
    labels={"Salary": "Average Salary"},
    template="plotly_dark",
    text_auto="0.4s",
)
custome_layout()
# Style the on-bar text and replace the default hover text
fig.update_traces(
    textfont={
        "size": 16,
        "family": "arial",
        "color": "#222",
    },
    hovertemplate="Education Level: %{x}<br>Average Salary: $%{y:0.4s}",
)
iplot(fig)

In [None]:
def groupping_exp(exp):
    """Map a number of years of experience to a band label.

    Bands are "0-5 years" (0 to 5 inclusive), "6-10 years" (above 5 up to 10),
    "11-15 years" (above 10 up to 15) and "16-20 years" (above 15 up to 20).
    Every other value falls through to "20+" - that includes values above 20,
    but also negative values and NaN, because none of the range checks match.
    """
    if 0 <= exp <= 5:
        return "0-5 years"

    # Remaining bands are open at the lower bound and closed at the upper one.
    upper_bands = (
        (5, 10, "6-10 years"),
        (10, 15, "11-15 years"),
        (15, 20, "16-20 years"),
    )
    for lower, upper, label in upper_bands:
        if lower < exp <= upper:
            return label
    return "20+"

In [None]:
# Band every row's experience, then average the salary per band (highest first)
experience_band = df["Years of Experience"].apply(groupping_exp)
salary_by_exp = df.groupby(experience_band)["Salary"].mean().sort_values(ascending=False)
# Display only: the same values formatted as dollar amounts
salary_by_exp.map(lambda amount: f"${amount:,.2f}")

In [None]:
# Bar chart of the average salary per experience band.
# The title, axis labels and hover text previously still said "Gender" /
# "Education" / "Frequency in PCT(%)" (copied from other charts); they now
# describe what is plotted. An explicit two-column frame is built so the axes
# can be labelled through real column names.
salary_exp_df = salary_by_exp.rename_axis("Experience").reset_index(name="Salary")
fig = px.bar(
    salary_exp_df,
    x="Experience",
    y="Salary",
    color="Experience",
    title="AVG Salary By Experience",
    color_discrete_sequence=["#45FFCA", "#D09CFA", "#FF9B9B", "#F875AA", "#3EDBF0"],
    labels={"Salary": "Average Salary"},
    template="plotly_dark",
    text_auto="0.4s",
)
custome_layout()
# Style the on-bar text and replace the default hover text
fig.update_traces(
    textfont={
        "size": 16,
        "family": "arial",
        "color": "#222",
    },
    hovertemplate="Experience: %{x}<br>Average Salary: $%{y:0.4s}",
)
iplot(fig)

# Correlation HeatMap & Charts

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap
correlation = df.corr(numeric_only=True)
fig = px.imshow(
    correlation,
    title="Correlations Between Data",
    color_continuous_scale="orrd",
    text_auto="0.2f",
    aspect=1,
    template="plotly_dark",
)
# Only the title font is styled here (legend and hover settings are left at their defaults)
fig.update_layout(title_font={"size": 28, "family": "tahoma"})
iplot(fig)

In [None]:
# Scatter-plot matrix over every numeric column, with points coloured by Salary
numeric_columns = df.select_dtypes(include="number").columns
fig = px.scatter_matrix(
    df,
    dimensions=numeric_columns,
    color="Salary",
    opacity=0.65,
    height=800,
    template="plotly_dark",
    title="Relationships Between Numerical Data",
)
# Only the title font is styled here (legend and hover settings are left at their defaults)
fig.update_layout(title_font={"size": 28, "family": "tahoma"})
iplot(fig)

<h3 style = "font: bold 18px arial;
             color: gold;
             background-color: #111;
             padding: 15px;
             border: 2px solid orangered">
    ► From these graphs and explorations, we can conclude that:
    <br>
    <br>
    • There is a <b style = "color: tomato">Strong</b> Linear Positive Correlation between Salary and Age
    <br>
    <br>
    • There is a <b style = "color: tomato">Strong</b> Linear Positive Correlation between Salary and Experience Years
</h3>

<p style = "color: #F3CCFF;
            font: bold 22px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid violet;
            border-radius: 8px">
    1] Encoding Categorical Data: (Converting Categorical Into Numerical)
</p>

In [None]:
# One-hot encode Education Level, dropping the first category's column.
# dtype=int makes the indicator columns 0/1 integers directly, instead of
# multiplying the whole frame (including the non-numeric columns) by 1.
df_encoded = pd.get_dummies(df, columns=["Education Level"], drop_first=True, dtype=int)
df_encoded.head()

<p style = "color: #F3CCFF;
            font: bold 22px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid violet;
            border-radius: 8px">
    2] Separating Features (X) and Target (y)
</p>

In [None]:
# Target is Salary; the features are every encoded column except
# the target itself and the Job Title and Gender columns
y = df_encoded["Salary"]
excluded_columns = ["Job Title", "Salary", "Gender"]
X = df_encoded.drop(columns=excluded_columns)

In [None]:
# Preview the feature matrix
X.head()

<p style = "color: #F3CCFF;
            font: bold 22px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid violet;
            border-radius: 8px">
    3] Train / Test Split
</p>

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90)

<p style = "color: #F3CCFF;
            font: bold 22px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid violet;
            border-radius: 8px">
    4] Cross Validation Score
</p>

In [None]:
# 10-fold splitter that shuffles the rows (with a fixed seed) before splitting
kf = KFold(n_splits=10, shuffle=True, random_state=30)

In [None]:
# Random forest regressor with 500 trees and a fixed seed
rf = RandomForestRegressor(n_estimators=500, random_state=11)

In [None]:
# Cross-validate on the full X / y (not only the training split) using the 10-fold splitter,
# then report the mean fold score as a percentage
# NOTE(review): no scoring argument is passed, so this is presumably the estimator's default score (R^2) - confirm
scores = cross_val_score(rf, X, y, cv=kf)
print(f"Cross Validation Score: {np.mean(scores)*100:0.2f}%")

<p style = "color: #F3CCFF;
            font: bold 22px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid violet;
            border-radius: 8px">
    5] Training the Model
</p>

In [None]:
# Fit the forest on the training split only
rf.fit(X_train, y_train)

In [None]:
# Score of the fitted model on the data it was trained on, as a percentage.
# Labelled "Training Score" so it is not mistaken for the held-out test score
# that is printed later under "Model Score".
score = rf.score(X_train, y_train)*100
print(f"Training Score: {np.round(score, 2)}%")

<p style = "color: #F3CCFF;
            font: bold 22px arial;
            padding: 15px;
            background-color: #111;
            border: 3px solid violet;
            border-radius: 8px">
    6] Prediction and Evaluation
</p>

In [None]:
# Predict salaries for the test split and round them to whole numbers
expected_salary = rf.predict(X_test).round()

In [None]:
# Side-by-side table of actual vs. predicted salary; error = predicted - actual
d = dict(
    [
        ("Actual_Salary", y_test),
        ("Expected_Salary", expected_salary),
        ("error", expected_salary - y_test),
    ]
)
expected_df = pd.DataFrame(d)
expected_df.head()

In [None]:
# R^2 of the rounded predictions against the actual test salaries, as a percentage
score = r2_score(y_test, expected_salary)*100
print(f"Model Score: {np.round(score, 2)}%")

In [None]:
# Root mean squared error of the rounded predictions on the test split.
# The value is in the same units as Salary, so it is reported as "RMSE"
# rather than as a ratio.
rmse = np.sqrt(mean_squared_error(y_test, expected_salary))
print(f"RMSE: {rmse:.3f}")

In [None]:
# Predicted vs. actual salary, coloured by prediction error, with an OLS trend line
fig = px.scatter(
    expected_df,
    x="Actual_Salary",
    y="Expected_Salary",
    color="error",
    trendline="ols",
    opacity=0.8,
    template="plotly_dark",
    title="Expected And Actual",
)
# Only the title font is styled here (legend and hover settings are left at their defaults)
fig.update_layout(title_font={"size": 28, "family": "tahoma"})
iplot(fig)

# GitHub Repo: <a href = "https://github.com/abed-r-j/ML1">Click Here To Go To GitHub Repo..</a>
