# Load needed libraries

In [1]:
import pandas as pd
import numpy as np

# Load data

Load the data from [this address](http://vincentarelbundock.github.io/Rdatasets/csv/carData/States.csv); the data description is available [here](http://vincentarelbundock.github.io/Rdatasets/doc/carData/States.html).

Assign dataset to `education` variable.

In [2]:
# Read the States dataset and give the first CSV column a meaningful name.
# `rename` returns a new frame, so no `inplace=True` is needed and the cell can
# be re-run safely. Two header spellings are mapped: pandas labels a blank
# header `Unnamed: 0`, while some Rdatasets snapshots reportedly label that
# column `rownames` (assumption -- verify against the live file). Labels that
# are not present are simply ignored by `rename`.
education = (
    pd.read_csv('http://vincentarelbundock.github.io/Rdatasets/csv/carData/States.csv')
    .rename(columns={'Unnamed: 0': 'state', 'rownames': 'state'})
)
education.head()

Unnamed: 0,state,region,pop,SATV,SATM,percent,dollars,pay
0,AL,ESC,4041,470,514,8,3.648,27
1,AK,PAC,550,438,476,42,7.887,43
2,AZ,MTN,3665,445,497,25,4.231,30
3,AR,WSC,2351,470,511,6,3.334,23
4,CA,PAC,29760,419,484,45,4.826,39


# 1. How many rows and columns are in the data?

In [3]:
# Row count: the length of a DataFrame is its number of rows.
len(education)

51

In [4]:
# Column count: one entry in `columns` per column.
len(education.columns)

8

# 2. See the first 10 entries

In [5]:
# First ten rows, selected by position.
education.iloc[:10]

Unnamed: 0,state,region,pop,SATV,SATM,percent,dollars,pay
0,AL,ESC,4041,470,514,8,3.648,27
1,AK,PAC,550,438,476,42,7.887,43
2,AZ,MTN,3665,445,497,25,4.231,30
3,AR,WSC,2351,470,511,6,3.334,23
4,CA,PAC,29760,419,484,45,4.826,39
5,CO,MTN,3294,456,513,28,4.809,31
6,CN,NE,3287,430,471,74,7.914,43
7,DE,SA,666,433,470,58,6.016,35
8,DC,SA,607,409,441,68,8.21,39
9,FL,SA,12938,418,466,44,5.154,30


# 3. List top 5 states with the highest average teacher's salary

In [6]:
# Order by `pay`, largest first, and keep the leading five rows.
education.sort_values('pay', ascending=False).head(5)

Unnamed: 0,state,region,pop,SATV,SATM,percent,dollars,pay
1,AK,PAC,550,438,476,42,7.887,43
6,CN,NE,3287,430,471,74,7.914,43
32,NY,MA,17990,412,470,70,8.5,42
4,CA,PAC,29760,419,484,45,4.826,39
8,DC,SA,607,409,441,68,8.21,39


# 4. Are there any missing values?

In [7]:
# Per column: True if at least one value is missing.
education.isna().any()

state      False
region     False
pop        False
SATV       False
SATM       False
percent    False
dollars    False
pay        False
dtype: bool

# 5. Show descriptive statistics for all columns

In [8]:
# Summary statistics for every column, not only the numeric ones.
education.describe(include="all")

Unnamed: 0,state,region,pop,SATV,SATM,percent,dollars,pay
count,51,51,51.0,51.0,51.0,51.0,51.0,51.0
unique,51,9,,,,,,
top,WY,SA,,,,,,
freq,1,9,,,,,,
mean,,,4876.647059,448.156863,497.392157,33.745098,5.17549,30.941176
std,,,5439.202691,30.821014,34.568817,24.073922,1.376166,5.308151
min,,,454.0,397.0,437.0,4.0,2.993,22.0
25%,,,1215.0,422.5,470.0,11.5,4.354,27.5
50%,,,3294.0,443.0,490.0,25.0,5.045,30.0
75%,,,5780.0,474.5,522.5,57.5,5.6895,33.5


# 6. Show 10 states with the lowest percentage of graduating high-school students who took the SAT exam.

In [9]:
# Ascending sort on `percent`; the first ten rows are the lowest values.
education.sort_values('percent').head(10)

Unnamed: 0,state,region,pop,SATV,SATM,percent,dollars,pay
24,MS,ESC,2573,477,519,4,3.322,24
44,UT,MTN,1723,492,539,5,2.993,25
41,SD,WNC,696,506,555,5,3.73,22
15,IA,WNC,2777,511,577,5,4.839,28
3,AR,WSC,2351,470,511,6,3.334,23
34,ND,WNC,639,505,564,6,3.685,23
0,AL,ESC,4041,470,514,8,3.648,27
36,OK,WSC,3146,478,523,9,3.742,24
18,LA,WSC,4220,476,517,9,4.012,26
27,NE,WNC,1578,484,546,10,4.381,26


# 7. Filter states with a population greater than or equal to 2,000,000

In [10]:
# `pop` appears to be recorded in thousands (e.g. CA = 29760), so a threshold
# of 2000 corresponds to 2,000,000 inhabitants.
is_large_state = education['pop'] >= 2000
education_sub = education[is_large_state]
education_sub.head()

Unnamed: 0,state,region,pop,SATV,SATM,percent,dollars,pay
0,AL,ESC,4041,470,514,8,3.648,27
2,AZ,MTN,3665,445,497,25,4.231,30
3,AR,WSC,2351,470,511,6,3.334,23
4,CA,PAC,29760,419,484,45,4.826,39
5,CO,MTN,3294,456,513,28,4.809,31


# 8. How many states did you get from Question 7?

In [11]:
# Number of rows left after the population filter.
education_sub.shape[0]

33

# 9. Using the data from Question 7, how many states have an average teacher's salary greater than or equal to $30,000?

In [12]:
# The $30,000 threshold is written as 30 here.
# Count the rows of the filtered frame that meet it.
education_sub[education_sub['pay'] >= 30].shape[0]

19

# 10. Show min, max, median and average percentage of graduating high-school students in the state who took the SAT exam in each US region

In [13]:
# Smallest `percent` value across all states.
education['percent'].min()

4

In [14]:
# Largest `percent` value across all states.
education['percent'].max()

74

In [15]:
# Median `percent` value across all states.
education['percent'].median()

25.0

In [16]:
# Mean `percent` value across all states.
education['percent'].mean()

33.745098039215684

In [17]:
# Question 10 asks for the statistics *per region*, so group the rows by
# `region` before aggregating `percent`. The result has one row per region and
# one column per statistic. (Re-run the cell to regenerate its output.)
education.groupby('region')['percent'].agg(['min', 'max', 'median', 'mean'])

# 11. BONUS: replace the abbreviations in the `region` column with full region names using the following dictionary
```python
new_names = {
'ENC': 'East North Central', 'ESC' : 'East South Central', 'MA' : 'Mid-Atlantic', 'MTN' : 'Mountain',
'NE' : 'New England', 'PAC' : 'Pacific', 'SA' : 'South Atlantic', 'WNC' : 'West North Central', 
'WSC' : 'West South Central'
}
```

In [18]:
# Lookup table: region abbreviation -> full region name.
new_names = dict(
    ENC='East North Central',
    ESC='East South Central',
    MA='Mid-Atlantic',
    MTN='Mountain',
    NE='New England',
    PAC='Pacific',
    SA='South Atlantic',
    WNC='West North Central',
    WSC='West South Central',
)

In [19]:
# Swap the region abbreviations for their full names. `replace` leaves any
# value that is not a key of `new_names` unchanged, so re-running this cell is
# harmless; `map` would turn the already-expanded names into NaN on a second
# run.
education['region'] = education['region'].replace(new_names)
education.head()

Unnamed: 0,state,region,pop,SATV,SATM,percent,dollars,pay
0,AL,East South Central,4041,470,514,8,3.648,27
1,AK,Pacific,550,438,476,42,7.887,43
2,AZ,Mountain,3665,445,497,25,4.231,30
3,AR,West South Central,2351,470,511,6,3.334,23
4,CA,Pacific,29760,419,484,45,4.826,39
