## Saving datasets from DataCamp

In [None]:
# Export the `salaries` DataFrame to salaries.csv, leaving out the index column.
# NOTE(review): `salaries` is not defined in any visible cell — presumably it is
# preloaded by the DataCamp workspace; confirm before running on a fresh kernel.
salaries.to_csv("salaries.csv", index=False)

In [None]:
# Export the `spotify_sample` DataFrame to spotify_sample.csv, leaving out the index column.
# NOTE(review): `spotify_sample` is not defined in any visible cell — presumably it is
# preloaded by the DataCamp workspace; confirm before running on a fresh kernel.
spotify_sample.to_csv("spotify_sample.csv", index=False)

In [None]:
# Print spotify_sample.csv back out to eyeball what was written.
# DataFrame.to_csv writes UTF-8 by default, so the file is read with the same
# encoding explicitly; open()'s default follows the platform locale and can
# garble or reject non-ASCII text.
with open("spotify_sample.csv", "r", encoding="utf-8") as file:
    print(file.read())

## Importing packages

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from jupyter_datatables import init_datatables_mode
from ydata_profiling import ProfileReport

In [80]:
from scipy.stats import binom
from scipy.stats import uniform
from scipy.stats import norm
from scipy.stats import poisson

## Using .select_dtypes()

In [3]:
import numpy as np
import pandas as pd

# Toy records with one or more columns per dtype family used in this section:
# integers, floats, booleans, datetimes and strings, plus some missing values.
data = dict(
    ID=[101, 102, 103, 104, 105],
    Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Ethan'],
    Age=[25, 30, 22, 28, 35],
    Height=[5.5, 6.0, 5.7, 5.8, 6.1],
    Weight=[55.2, 78.5, 60.0, 62.3, 80.1],
    Is_Student=[True, False, True, False, False],
    Gender=['F', 'M', 'M', 'F', 'M'],
    Enrollment_Date=pd.to_datetime(
        ['2023-01-01', '2022-09-15', '2021-06-10', '2020-11-20', '2023-03-03']
    ),
    Score=[np.nan, 88.5, 92.3, np.nan, 85.0],  # two missing scores
    Remarks=[None, 'Excellent', 'Good', 'Average', 'Excellent'],  # one missing remark
)

df = pd.DataFrame(data)

print("Full DataFrame:")
df

Full DataFrame:


Unnamed: 0,ID,Name,Age,Height,Weight,Is_Student,Gender,Enrollment_Date,Score,Remarks
0,101,Alice,25,5.5,55.2,True,F,2023-01-01,,
1,102,Bob,30,6.0,78.5,False,M,2022-09-15,88.5,Excellent
2,103,Charlie,22,5.7,60.0,True,M,2021-06-10,92.3,Good
3,104,Diana,28,5.8,62.3,False,F,2020-11-20,,Average
4,105,Ethan,35,6.1,80.1,False,M,2023-03-03,85.0,Excellent


In [7]:
# Keep just the numeric columns (integers and floats); booleans are not included
numeric_df = df.select_dtypes(include=['number'])
print("\nNumeric Columns:")
numeric_df


Numeric Columns:


Unnamed: 0,ID,Age,Height,Weight,Score
0,101,25,5.5,55.2,
1,102,30,6.0,78.5,88.5
2,103,22,5.7,60.0,92.3
3,104,28,5.8,62.3,
4,105,35,6.1,80.1,85.0


In [9]:
# Keep just the object-dtype (string-like) columns
object_df = df.select_dtypes(include=['object'])
print("\nObject (string-like) Columns:")
object_df


Object (string-like) Columns:


Unnamed: 0,Name,Gender,Remarks
0,Alice,F,
1,Bob,M,Excellent
2,Charlie,M,Good
3,Diana,F,Average
4,Ethan,M,Excellent


In [11]:
# Keep just the boolean columns
bool_df = df.select_dtypes(include=['bool'])
print("\nBoolean Columns:")
bool_df


Boolean Columns:


Unnamed: 0,Is_Student
0,True
1,False
2,True
3,False
4,False


In [13]:
# Keep just the datetime columns
datetime_df = df.select_dtypes(include=['datetime'])
print("\nDatetime Columns:")
datetime_df


Datetime Columns:


Unnamed: 0,Enrollment_Date
0,2023-01-01
1,2022-09-15
2,2021-06-10
3,2020-11-20
4,2023-03-03


In [15]:
# Passing a list selects several dtype families at once (here numeric and boolean);
# the selected columns keep the order they have in df.
selected_df = df.select_dtypes(include=['number', 'bool'])
selected_df

Unnamed: 0,ID,Age,Height,Weight,Is_Student,Score
0,101,25,5.5,55.2,True,
1,102,30,6.0,78.5,False,88.5
2,103,22,5.7,60.0,True,92.3
3,104,28,5.8,62.3,False,
4,105,35,6.1,80.1,False,85.0


## Using np.select()

In [19]:
import numpy as np
import pandas as pd

# Five people with ages on both sides of the 18 and 30 cut-offs used below
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Ethan'],
        Age=[25, 17, 34, 15, 45],
    )
)

df

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,17
2,Charlie,34
3,Diana,15
4,Ethan,45


In [21]:
# One boolean mask per age band, in the same order as the labels below
conditions = [
    df['Age'].lt(18),
    df['Age'].ge(18) & df['Age'].lt(30),
    df['Age'].ge(30),
]

# Label handed out by the condition at the same position
choices = ['Teenager', 'Young Adult', 'Adult']

# For each row np.select takes the choice of the first condition that holds;
# rows matching none of them get the default
df['Age_Group'] = np.select(conditions, choices, default='Unknown')

df

Unnamed: 0,Name,Age,Age_Group
0,Alice,25,Young Adult
1,Bob,17,Teenager
2,Charlie,34,Adult
3,Diana,15,Teenager
4,Ethan,45,Adult


## Using np.where()

In [26]:
import numpy as np
import pandas as pd

# Five scores, some above and some below the pass mark of 60 used below
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Ethan'],
        Score=[85, 40, 67, 90, 55],
    )
)

df

Unnamed: 0,Name,Score
0,Alice,85
1,Bob,40
2,Charlie,67
3,Diana,90
4,Ethan,55


In [28]:
# np.where picks element-wise: 'Pass' where the score is 60 or more, 'Fail' elsewhere
df['Result'] = np.where(df['Score'].ge(60), 'Pass', 'Fail')

print(df)

      Name  Score Result
0    Alice     85   Pass
1      Bob     40   Fail
2  Charlie     67   Pass
3    Diana     90   Pass
4    Ethan     55   Fail


## Using .map()

In [31]:
import pandas as pd

# Five people with a one-letter gender code
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Ethan'],
        Gender=['F', 'M', 'M', 'F', 'M'],
    )
)

df

Unnamed: 0,Name,Gender
0,Alice,F
1,Bob,M
2,Charlie,M
3,Diana,F
4,Ethan,M


In [33]:
# Lookup table from the one-letter code to the spelled-out label
gender_map = dict(M='Male', F='Female')

# .map() replaces each code with its entry in the lookup table
df['Gender_Full'] = df['Gender'].map(gender_map)

print(df)

      Name Gender Gender_Full
0    Alice      F      Female
1      Bob      M        Male
2  Charlie      M        Male
3    Diana      F      Female
4    Ethan      M        Male


## Using .query()

In [38]:
import pandas as pd

# Five people with the age, gender and score columns filtered on below
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Ethan'],
        Age=[25, 17, 34, 15, 45],
        Gender=['F', 'M', 'M', 'F', 'M'],
        Score=[85, 40, 67, 90, 55],
    )
)

df

Unnamed: 0,Name,Age,Gender,Score
0,Alice,25,F,85
1,Bob,17,M,40
2,Charlie,34,M,67
3,Diana,15,F,90
4,Ethan,45,M,55


In [40]:
# .query() evaluates the string against the frame's column names.
# Keep the rows that satisfy both conditions: Age >= 18 and Score > 60.
# The matching rows keep their original index labels.
adults_with_high_scores = df.query('Age >= 18 and Score > 60')

print(adults_with_high_scores)

      Name  Age Gender  Score
0    Alice   25      F     85
2  Charlie   34      M     67


In [44]:
# Backticks quote a column name inside a query string; here they wrap the
# column `Name`, and the row whose Name equals "Bob" is returned.
df.query('`Name` == "Bob"')

Unnamed: 0,Name,Age,Gender,Score
1,Bob,17,M,40


## Using .query() with `in` and `not in`

In [67]:
import pandas as pd

# Five employees spread over four departments
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Ethan'],
        Department=['HR', 'Sales', 'IT', 'HR', 'Finance'],
        Salary=[50000, 60000, 70000, 52000, 75000],
    )
)

df

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000
1,Bob,Sales,60000
2,Charlie,IT,70000
3,Diana,HR,52000
4,Ethan,Finance,75000


In [69]:
# Departments to filter on
target_departments = ['HR', 'IT']

# Inside a query string, @name refers to a Python variable rather than a column.
# Both the membership filter and its complement are computed here; only the
# first one is displayed by this cell.
in_result = df.query('Department in @target_departments')
not_in_result = df.query('Department not in @target_departments')

print("Employees in HR or IT:")
in_result

Employees in HR or IT:


Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000
2,Charlie,IT,70000
3,Diana,HR,52000


In [71]:
# Display the complement computed in the previous cell:
# rows whose Department is not in target_departments.
print("\nEmployees NOT in HR or IT:")
not_in_result


Employees NOT in HR or IT:


Unnamed: 0,Name,Department,Salary
1,Bob,Sales,60000
4,Ethan,Finance,75000


## Using reset_index()

In [59]:
import pandas as pd

# Four rows labelled 'a'..'d' instead of the default 0..3 integer index
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'Diana'],
        Age=[25, 30, 22, 28],
    ),
    index=['a', 'b', 'c', 'd'],
)

df

Unnamed: 0,Name,Age
a,Alice,25
b,Bob,30
c,Charlie,22
d,Diana,28


In [61]:
# Reset the index: the old labels move into a regular column named 'index'
# and the rows are renumbered from 0. (No columns are renamed in this cell.)
df_reset = df.reset_index()
df_reset

Unnamed: 0,index,Name,Age
0,a,Alice,25
1,b,Bob,30
2,c,Charlie,22
3,d,Diana,28


In [63]:
# Give the columns produced by reset_index() clearer names.
# rename() returns a new frame, so the result is re-bound to df_reset instead
# of mutating the displayed frame with inplace=True.
df_reset = df_reset.rename(
    columns={'index': 'Custom_Index', 'Name': 'Full_Name', 'Age': 'Age_Years'}
)

print("DataFrame after reset_index() and renaming columns:")
df_reset

DataFrame after reset_index() and renaming columns:


Unnamed: 0,Custom_Index,Full_Name,Age_Years
0,a,Alice,25
1,b,Bob,30
2,c,Charlie,22
3,d,Diana,28


In [65]:
# Reset the index and discard the old labels: with drop=True they are not
# kept as a column, so only Name and Age remain under a fresh 0..n-1 index.
df_reset_drop = df.reset_index(drop=True)

print("\nDataFrame after reset_index(drop=True):")
df_reset_drop


DataFrame after reset_index(drop=True):


Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,22
3,Diana,28


## Using .cat.categories and .cat.remove_unused_categories()

In [38]:
import pandas as pd

# Four employees; JobRole is stored with the categorical dtype
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'David'],
        JobRole=pd.Categorical(['Engineer', 'Manager', 'Engineer', 'Clerk']),
    )
)

df

Unnamed: 0,Name,JobRole
0,Alice,Engineer
1,Bob,Manager
2,Charlie,Engineer
3,David,Clerk


In [40]:
# List the categories known to the JobRole column's categorical dtype
df['JobRole'].cat.categories

Index(['Clerk', 'Engineer', 'Manager'], dtype='object')

In [42]:
# Boolean-mask filter: keep the rows whose JobRole is 'Engineer'.
# .copy() makes the result an independent frame, so the column assignments in
# the cells that follow do not write to a slice of `df`
# (which is what triggers SettingWithCopyWarning).
filtered_df = df[df['JobRole'] == 'Engineer'].copy()
filtered_df

Unnamed: 0,Name,JobRole
0,Alice,Engineer
2,Charlie,Engineer


In [44]:
# Filtering rows does not prune the category list: the column still reports
# all three original categories.
filtered_df['JobRole'].cat.categories

Index(['Clerk', 'Engineer', 'Manager'], dtype='object')

Even though only `"Engineer"` is used, `"Clerk"` and `"Manager"` are still listed as categories — they're unused.

In [50]:
# Work on an independent copy so the assignment below does not write to a
# slice of `df`.
filtered_df = filtered_df.copy()

# Drop the categories that no longer occur in the filtered rows. The column
# must stay categorical (no .astype('object')), otherwise the .cat accessor on
# the next line is not available; plain column assignment replaces the column
# together with its pruned dtype.
filtered_df['JobRole'] = filtered_df['JobRole'].cat.remove_unused_categories()
filtered_df['JobRole'].cat.categories

Index(['Engineer'], dtype='object')

Now the JobRole column only contains the categories that actually exist in the filtered data.

In [52]:
# Re-bind filtered_df to an independent copy of itself.
# NOTE(review): presumably done so that later column assignments don't warn
# about writing to a slice of `df` — confirm.
filtered_df = filtered_df.copy()

In [54]:
# Replace the JobRole COLUMN with a version whose unused categories are dropped.
# (.loc['JobRole'] would address a row labelled 'JobRole', not the column,
# so the column is assigned by name instead.)
filtered_df['JobRole'] = filtered_df['JobRole'].cat.remove_unused_categories()
filtered_df['JobRole'].cat.categories

Index(['Engineer'], dtype='object')

In [21]:
import pandas as pd

# Five-row population used to contrast sampling without and with replacement
attrition_pop = pd.DataFrame(
    dict(Employee=['Alice', 'Bob', 'Charlie', 'David', 'Eve'])
)

attrition_pop

Unnamed: 0,Employee
0,Alice
1,Bob
2,Charlie
3,David
4,Eve


In [23]:
# frac=1 with replace=False draws every row exactly once, i.e. a shuffle.
# random_state pins the draw so a fresh Restart & Run All gives the same order.
shuffled = attrition_pop.sample(frac=1, replace=False, random_state=42)
shuffled

Unnamed: 0,Employee
0,Alice
2,Charlie
4,Eve
1,Bob
3,David


In [25]:
# frac=1 with replace=True draws as many rows as the original, but each draw
# can pick any row again, so some rows repeat and others are left out.
# random_state pins the draw so a fresh Restart & Run All gives the same sample.
resampled = attrition_pop.sample(frac=1, replace=True, random_state=42)
resampled

Unnamed: 0,Employee
4,Eve
1,Bob
4,Eve
4,Eve
0,Alice


In [None]:

# Write the sample to disk, then print the file back to check what was written.
# This repeats the export/preview cells at the top of the notebook.
# NOTE(review): `spotify_sample` is not defined in any visible cell — presumably
# it comes from the DataCamp workspace; confirm before running on a fresh kernel.
spotify_sample.to_csv("spotify_sample.csv", index=False)

# to_csv writes UTF-8 by default, so read with the same encoding explicitly;
# open()'s default follows the platform locale and can garble or reject
# non-ASCII text.
with open("spotify_sample.csv", "r", encoding="utf-8") as file:
    print(file.read())

In [None]:
import numpy as np
import pandas as pd

# Wrap the continent labels in a Series so that .map() is available.
# NOTE(review): np_cont is not defined in any visible cell — presumably an
# array-like of continent names; confirm where it comes from.
continent_series = pd.Series(np_cont)

# Colour assigned to each continent name
col_dict = dict(
    Asia='red',
    Europe='green',
    Africa='blue',
    Americas='yellow',
    Oceania='black',
)

# Look every continent up in col_dict: one colour per element of np_cont,
# returned as a plain Python list (intended as the colours of a scatter plot)
col = continent_series.map(col_dict).tolist()
