# In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the Excel workbook into a DataFrame. The path below is a placeholder
# and must point at the actual dataset file before this cell is run.
dataset_path = 'path_to_your_dataset.xlsx'
df = pd.read_excel(dataset_path)

# Quick overview: leading rows, (rows, columns) shape, and the summary
# statistics produced by DataFrame.describe().
print(df.head())

dataset_shape = df.shape
print(f"Shape of the dataset: {dataset_shape}")

print(df.describe())

# Use seaborn's white grid theme for the plots
sns.set(style="whitegrid")

# One 2x2 figure holding a histogram (with KDE overlay) per numeric column.
fig, axs = plt.subplots(2, 2, figsize=(14, 10))

# (column, bar colour, panel title), listed in row-major panel order so that
# each entry lines up with the matching axis from ``axs.flat``.
histogram_specs = [
    ('Salary', 'blue', 'Salary Distribution'),
    ('10percentage', 'green', '10th Percentage Distribution'),
    ('12percentage', 'red', '12th Percentage Distribution'),
    ('collegeGPA', 'orange', 'College GPA Distribution'),
]

for panel, (column, colour, title) in zip(axs.flat, histogram_specs):
    sns.histplot(df[column], bins=30, kde=True, ax=panel, color=colour)
    panel.set_title(title)

plt.tight_layout()
plt.show()

# Horizontal boxplots of the same four columns, to make outliers visible.
fig, axs = plt.subplots(2, 2, figsize=(14, 10))

# (column, box colour, panel title), listed in row-major panel order so that
# each entry lines up with the matching axis from ``axs.flat``.
boxplot_specs = [
    ('Salary', 'blue', 'Boxplot of Salary'),
    ('10percentage', 'green', 'Boxplot of 10th Percentage'),
    ('12percentage', 'red', 'Boxplot of 12th Percentage'),
    ('collegeGPA', 'orange', 'Boxplot of College GPA'),
]

for panel, (column, colour, title) in zip(axs.flat, boxplot_specs):
    sns.boxplot(x=df[column], ax=panel, color=colour)
    panel.set_title(title)

plt.tight_layout()
plt.show()

# Scatter plots of Salary (y axis) against each academic score (x axis),
# one figure per score.
scatter_specs = [
    ('10percentage', 'Scatter plot between 10th Percentage and Salary'),
    ('collegeGPA', 'Scatter plot between College GPA and Salary'),
]

for score_column, scatter_title in scatter_specs:
    sns.scatterplot(data=df, x=score_column, y='Salary')
    plt.title(scatter_title)
    plt.show()

# Salary boxplots grouped by a categorical column, one figure per column.
# The third field is an optional x tick rotation in degrees; it is set only
# for Specialization, whose tick labels are drawn vertically.
grouped_boxplot_specs = [
    ('Gender', 'Boxplot of Salary by Gender', None),
    ('Specialization', 'Boxplot of Salary by Specialization', 90),
]

for group_column, group_title, tick_rotation in grouped_boxplot_specs:
    sns.boxplot(data=df, x=group_column, y='Salary')
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.title(group_title)
    plt.show()

# Stacked bar plot for Gender and Specialization: cross-tabulate the two
# columns (rows = Gender, columns = Specialization) and draw one stacked bar
# per Gender value, with one segment per Specialization count.
pd.crosstab(df['Gender'], df['Specialization']).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Plot of Gender and Specialization')
plt.show()

# Subset of data for specific job titles
engineer_roles = ['Programming Analyst', 'Software Engineer', 'Hardware Engineer', 'Associate Engineer']

# Match titles case-insensitively and ignoring surrounding whitespace, so rows
# spelled e.g. 'software engineer' are not silently excluded from the subset.
# Missing designations become the string 'nan' and therefore never match.
normalized_roles = {role.strip().lower() for role in engineer_roles}
normalized_designations = df['Designation'].astype(str).str.strip().str.lower()
engineer_data = df[normalized_designations.isin(normalized_roles)]

# Average salary of engineers (NaN when no row matched or every matched
# Salary value is missing)
avg_engineer_salary = engineer_data['Salary'].mean()
print(f"The average salary of fresh engineers is: {avg_engineer_salary}")

# Checking if the salary falls within the claim of 2.5-3 lakhs.
# NOTE(review): assumes Salary is stored in plain rupees per year — confirm.
if pd.isna(avg_engineer_salary):
    # Comparing NaN against the bounds is always False, which would report the
    # claim as wrong even though there was nothing to evaluate it on.
    print("No salary data found for the selected roles; the claim cannot be evaluated.")
elif 250000 <= avg_engineer_salary <= 300000:
    print("The claim is correct.")
else:
    print("The claim is not correct.")


# Chi-square test of independence between Gender and Specialization
from scipy.stats import chi2_contingency

# Observed counts: one row per Gender value, one column per Specialization.
contingency_table = pd.crosstab(index=df['Gender'], columns=df['Specialization'])

# chi2_contingency returns the test statistic, the p-value, the degrees of
# freedom and the table of expected frequencies, in that order.
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-square statistic: {chi2}, p-value: {p}")

# The relationship is reported as significant when p is below 0.05.
verdict = (
    "There is a significant relationship between Gender and Specialization."
    if p < 0.05
    else "There is no significant relationship between Gender and Specialization."
)
print(verdict)