# Importing required libraries and loading the data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the responses from the CSV; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='campus.csv')

# Initial cleaning

In [None]:
# Checkpoint: keep an independent copy of the frame as loaded, taken before
# any cleaning step touches `df`.
check1 = df.copy(deep=True)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Remove the 'Timestamp' column; the frame is modified in place.
df.drop(columns=['Timestamp'], inplace=True)


In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Print a heading followed by the array of current column labels.
print("All column names\n\n", df.columns.values, sep="\n")

In [None]:
# Map each verbose question header to a short, code-friendly column name.
# Keys must match the CSV headers character for character.
column_aliases = {
    'What is your current year of study': 'year',
    'What is your current relationship status?': 'relationship',
    'How do you prefer studying before exams?': 'study',
    'Do you do competitive programming?': 'CP',
    'Which hostel do you live in?': 'hostel',
    'What is your gender?': 'gender',
    'What is your favourite sport?': 'sport',
    'Favourite hangout place in campus': 'hangout_spot',
    'Which state/union territory do you belong to?': 'state',
    'What is your dept?': 'dept',
    'What is your preferred coding language?': 'coding_lang',
    'What is your favourite place to eat at IIT Guwahati?': 'fav_place_to_eat',
    'Which is your favourite club in IIT Guwahati?': 'fav_club',
    'Are you satisfied in IIT Guwahati': 'satisfied',
    'What is your current cpi?': 'cpi',
}
df.rename(columns=column_aliases, inplace=True)


In [None]:
# Report the (rows, columns) shape, then preview the first five rows.
frame_shape = df.shape
print(frame_shape)
df.head(n=5)

In [None]:
# Print the frequency table of every column in turn, each followed by a
# line containing a single space as a visual separator.
small = df.columns.values
for column_name in small:
    print(df.value_counts(column_name), ' ', sep='\n')

# Replacing entries with more workable values

In [None]:
# Shorten the relationship-status answers to single-word labels.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` unchanged (pandas 2.2 already warns about it).
df['relationship'] = df['relationship'].replace({
    'In a relationship': 'relationship',
    "It's complicated": 'complicated',
    "Single": 'single',
})

In [None]:
# Convert the ordinal year labels to integers so they can be compared
# numerically (e.g. `df['year'] == 2`).
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` unchanged (pandas 2.2 already warns about it).
df['year'] = df['year'].replace({'1st': 1, '2nd': 2, '3rd': 3, '4th': 4})

In [None]:
# Replace the long study-habit answers with short labels.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` unchanged (pandas 2.2 already warns about it).
df['study'] = df['study'].replace({
    'Pulling of all nighters :)': 'all_nighter',
    'Studying on a regular basis': 'regular',
})

In [None]:
# Encode the competitive-programming answer as 1 (Yes) / 0 (No).
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` unchanged (pandas 2.2 already warns about it).
df['CP'] = df['CP'].replace({"Yes": 1, "No": 0})

In [None]:
# Shorten the hangout-spot answers to compact labels.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` unchanged (pandas 2.2 already warns about it).
df['hangout_spot'] = df['hangout_spot'].replace({
    "IIT G lake (in front of library)": "IITG lake",
    "Serpentine Lake 😏": "Serpentine",
    "Lake in front of the hospital": "Tihor lake",
})

In [None]:
# Normalise one state name and one department spelling.
# Results are assigned back to the columns: `df[col].replace(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` unchanged (pandas 2.2 already warns about it).
df['state'] = df['state'].replace("The Government of NCT of Delhi", "Delhi")
df['dept'] = df['dept'].replace("Dsai", "DSAI")

In [None]:
# Free-text CPI answers that are treated as missing (jokes, refusals and
# the numeric strings listed here).
null_cpi = ["😂😂😂😂","noooooooooooo",'1st year Student. So CPI "0"','1','Bruh','1107.87','-','Currently in 1st sem','na','69','0','100','I prefer not to say', '11', '11.97', '10']

# Approximate answers mapped to a representative numeric value.
cpi_text_to_value = {'Between 8 and 9': 8.5, '8+': 8.1, '7+🤣': 7.1}

# Build the cleaned column without chained in-place mutation:
# `df['cpi'].replace(..., inplace=True)` operates on a temporary Series under
# pandas Copy-on-Write and would leave `df` unchanged (pandas 2.2 already warns).
cpi_cleaned = df['cpi'].replace(null_cpi, np.nan).replace(cpi_text_to_value)

# Strict conversion on purpose: any answer not handled above raises here
# instead of being silently turned into NaN.
df['cpi'] = pd.to_numeric(cpi_cleaned)

In [None]:
# Fix the misspelt hostel name and mark "Dibang" answers as missing.
# NOTE(review): the reason "Dibang" is treated as invalid is not visible here — confirm.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` unchanged (pandas 2.2 already warns about it).
df['hostel'] = (
    df['hostel']
    .replace("Bhramaputra", "Brahmaputra")
    .replace("Dibang", np.nan)
)

In [None]:
# Inspect the rows whose department is 'HSS'.
df.loc[df['dept'] == 'HSS']

In [None]:
# Exclude the 'HSS' rows, then renumber the index from 0 and take an
# independent copy so later edits do not act on a filtered view.
df = (
    df.loc[df['dept'] != 'HSS']
    .reset_index(drop=True)
    .copy()
)

In [None]:
# Fold the 'ml.ai' answers into 'iitg.ai'.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` unchanged (pandas 2.2 already warns about it).
df['fav_club'] = df['fav_club'].replace('ml.ai', 'iitg.ai')

In [None]:
# Cap printed frames/series at 20 rows, then list how often each raw
# 'sport' answer occurs.
pd.set_option('display.max_rows', 20)
sport_counts = df.value_counts(subset=['sport'])
print(sport_counts)

In [None]:
# Answers that do not name a sport are treated as missing.
null_sports = ['Sex', 'none','Nothing','Bro?','No sport lol','People Gazing','Na','Playing with my mental health','Coding','Genshin Impact']
# Spellings of table tennis seen in the responses.
tt=['tt', 'table tennis ','table tennis','Table Tennis', 'Table Tennis ', 'Table tennis', 'Table tennis ']

# Canonical label -> raw spellings to fold into it. Multi-sport answers are
# normalised to a ', '-separated list.
sport_variants = {
    'TT': tt,
    'Chess': ['chess'],
    'Tennis': ['LT', 'Tennis '],
    'Gym': ['GYMMING', 'GYMMING '],
    'Athletics': ['Athletic', 'Athletics ', 'Athletic ', 'Running'],
    'Cricket': ['Cricket ', 'cricket', 'CRICKET', 'Bat-Ball'],
    'Badminton': ['Badminton lol', 'Badminton ', 'badminton'],
    'Volleyball': ['Volleyball ', 'to watch is cricket and basketball, to play is volleyball'],
    'Basketball': ['basketball', 'Basketball ', 'basketball '],
    'Swimming': ['swimming ', 'Swimming ', 'swimming'],
    'Football': ['futbol', 'Football '],
    'Cricket, Football, Swimming, Tennis': ['Cricket, Football, Swimming, Tennis.'],
    'Cricket, Badminton': ['Cricket,badminton'],
    'Cricket, Volleyball': ['Cricket,volleyball'],
    'Swimming, Badminton': ['Swimming and badminton'],
}
# Invert to raw spelling -> canonical label for a single replace() pass.
# No canonical label is itself a raw spelling, so one pass gives the same
# result as applying the replacements one after another.
sport_aliases = {
    raw: canonical
    for canonical, raws in sport_variants.items()
    for raw in raws
}

# Assign the result back to the column: `df['sport'].replace(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` unchanged (pandas 2.2 already warns about it).
df['sport'] = df['sport'].replace(null_sports, np.nan).replace(sport_aliases)

In [None]:
# Build a separate one-column frame with one row per (respondent, sport):
# split multi-sport answers on ', ', stack the pieces into a long Series,
# and drop the inner index level so each row keeps its original row label.
sports_df = (
    df['sport']
    .str.split(', ', expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .to_frame('sport')
)
print(sports_df.value_counts())

In [None]:
# Checkpoint: independent copy of the frame at this point in the cleaning.
check2 = df.copy(deep=True)

In [None]:
# Inspect the rows with year == 1.
df.loc[df['year'] == 1]

# Exploratory Data Analysis (EDA)

# We will look at some questions not already covered in the reference notebook

## Analysing Competitive Programming Scene

In [None]:
print("What is the favourite programming language of competitive programmers throughout the years?\n")

# Restrict to respondents who do competitive programming (CP == 1): the
# question above is about competitive programmers, not all students.
cp_students = df[df['CP'] == 1]

# (panel title, rows to plot) for each stacked panel, top to bottom.
language_panels = [
    ('2nd Year', cp_students[cp_students['year'] == 2]),
    ('3rd Year', cp_students[cp_students['year'] == 3]),
    ('4th Year', cp_students[cp_students['year'] == 4]),
    ('Overall', cp_students),
]

fig, axes = plt.subplots(len(language_panels), 1, figsize=(10, 16))
for ax, (panel_title, panel_rows) in zip(axes, language_panels):
    # Count each language once and reuse the result for both axes.
    language_counts = panel_rows['coding_lang'].value_counts()
    if language_counts.empty:
        # Nothing to draw for this group; say so instead of plotting no bars.
        ax.text(0.5, 0.5, 'No responses', ha='center', va='center',
                transform=ax.transAxes)
    else:
        sns.barplot(x=language_counts.values, y=language_counts.index,
                    orient='h', ax=ax)
    ax.set_title(panel_title)
    ax.set_ylabel('Preferred Coding Language')

fig.tight_layout()
plt.show()



Some observations:-
* C++ seems to be the preferred language throughout the years for competitive programmers.
* Java and C lose their popularity among the higher year competitive programmers.

### Are our coders single?

In [None]:
# Fixed category order and colours so the bars appear in the same position
# and colour in every panel, even when a status is absent from a subset.
status_order = ['single', 'relationship', 'complicated']
status_palette = {'relationship': 'blue', 'single': 'red', 'complicated': 'orange'}

# Only respondents who do competitive programming (CP == 1).
cp_students = df[df['CP'] == 1]

# (panel title, rows to plot) for the 2x2 grid, filled row by row.
relationship_panels = [
    ('Overall', cp_students),
    ('2nd Year', cp_students[cp_students['year'] == 2]),
    ('3rd Year', cp_students[cp_students['year'] == 3]),
    ('4th Year', cp_students[cp_students['year'] == 4]),
]

fig, axes = plt.subplots(2, 2, figsize=(9, 10))
for ax, (panel_title, panel_rows) in zip(axes.flat, relationship_panels):
    # `palette` is tied to `hue`: passing a palette without `hue` is
    # deprecated in seaborn 0.13. dodge=False keeps one full-width bar per
    # category, since hue and x are the same variable.
    sns.countplot(data=panel_rows, x='relationship', hue='relationship',
                  order=status_order, hue_order=status_order,
                  palette=status_palette, dodge=False, ax=ax)
    # The legend would only repeat the x tick labels; drop it if seaborn added one.
    panel_legend = ax.get_legend()
    if panel_legend is not None:
        panel_legend.remove()
    ax.set_title(panel_title)
    ax.set_xlabel('Relationship Status')

plt.show()

Observations:-
* As expected, most of our competitive programmers are single.
* But those who are in a relationship can be found in 2nd and 4th years.
* The competitive programmers are not getting into complicated relations as the years increase

### How does department affect satisfaction and CPI?

### Does satisfaction affect CPI?

### Does satisfaction have anything to do with gender?