In [382]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from scipy.stats import ttest_ind

pd.set_option('display.max_rows', 50) # показывать больше строк
pd.set_option('display.max_columns', 50) # показывать больше колонок

stud = pd.read_csv('stud_math.xls')
display(stud.head(10))
stud.info()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,"studytime, granular",higher,internet,romantic,famrel,freetime,goout,health,absences,score
0,GP,F,18,U,,A,4.0,4.0,at_home,teacher,course,mother,2.0,2.0,0.0,yes,no,no,no,yes,-6.0,yes,,no,4.0,3.0,4.0,3.0,6.0,30.0
1,GP,F,17,U,GT3,,1.0,1.0,at_home,other,course,father,1.0,2.0,0.0,no,yes,no,no,no,-6.0,yes,yes,no,5.0,3.0,3.0,3.0,4.0,30.0
2,GP,F,15,U,LE3,T,1.0,1.0,at_home,other,other,mother,1.0,2.0,3.0,yes,no,,no,yes,-6.0,yes,yes,,4.0,3.0,2.0,3.0,10.0,50.0
3,GP,F,15,U,GT3,T,4.0,2.0,health,,home,mother,1.0,3.0,0.0,no,yes,yes,yes,yes,-9.0,yes,yes,yes,3.0,2.0,2.0,5.0,2.0,75.0
4,GP,F,16,U,GT3,T,3.0,3.0,other,other,home,father,1.0,2.0,0.0,no,yes,yes,no,yes,-6.0,yes,no,no,4.0,3.0,2.0,5.0,4.0,50.0
5,GP,M,16,U,LE3,T,4.0,3.0,services,other,reputation,mother,1.0,2.0,0.0,no,yes,yes,yes,yes,-6.0,yes,yes,no,5.0,4.0,2.0,5.0,10.0,75.0
6,GP,M,16,,LE3,T,2.0,2.0,other,other,home,mother,1.0,2.0,0.0,no,no,no,no,yes,-6.0,yes,yes,no,4.0,4.0,4.0,3.0,0.0,55.0
7,GP,F,17,U,GT3,A,4.0,4.0,other,teacher,home,mother,2.0,2.0,0.0,yes,yes,no,no,yes,-6.0,yes,no,no,4.0,1.0,4.0,1.0,6.0,30.0
8,GP,M,15,U,LE3,A,3.0,2.0,services,other,home,mother,1.0,2.0,0.0,no,yes,yes,no,yes,-6.0,yes,yes,no,,2.0,2.0,1.0,0.0,95.0
9,GP,M,15,U,,,3.0,4.0,other,other,home,mother,1.0,2.0,0.0,no,yes,yes,yes,yes,-6.0,yes,yes,no,5.0,5.0,1.0,5.0,0.0,75.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   school               395 non-null    object 
 1   sex                  395 non-null    object 
 2   age                  395 non-null    int64  
 3   address              378 non-null    object 
 4   famsize              368 non-null    object 
 5   Pstatus              350 non-null    object 
 6   Medu                 392 non-null    float64
 7   Fedu                 371 non-null    float64
 8   Mjob                 376 non-null    object 
 9   Fjob                 359 non-null    object 
 10  reason               378 non-null    object 
 11  guardian             364 non-null    object 
 12  traveltime           367 non-null    float64
 13  studytime            388 non-null    float64
 14  failures             373 non-null    float64
 15  schoolsup            386 non-null    obj

Всего строк 395, но во многих столбцах присутствуют нулевые строки - их необходимо обработать
Столбец Medu и Fedu должен быть целого типа согласно описанию и принимать значения: 1, 2, 3 или 4

# Первичная обработка

Для удобства использования pandas переименуем все столбцы:

In [383]:
stud.columns = ['school', 'sex', 'age', 'address', 'famsize', 'pstatus', 'medu', 'fedu', 'mjob', 'fjob', 'reason', 'guardian', 
                'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 
                'studytime_granular', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'health', 'absences', 
                'score']
display(stud.head())

Unnamed: 0,school,sex,age,address,famsize,pstatus,medu,fedu,mjob,fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,studytime_granular,higher,internet,romantic,famrel,freetime,goout,health,absences,score
0,GP,F,18,U,,A,4.0,4.0,at_home,teacher,course,mother,2.0,2.0,0.0,yes,no,no,no,yes,-6.0,yes,,no,4.0,3.0,4.0,3.0,6.0,30.0
1,GP,F,17,U,GT3,,1.0,1.0,at_home,other,course,father,1.0,2.0,0.0,no,yes,no,no,no,-6.0,yes,yes,no,5.0,3.0,3.0,3.0,4.0,30.0
2,GP,F,15,U,LE3,T,1.0,1.0,at_home,other,other,mother,1.0,2.0,3.0,yes,no,,no,yes,-6.0,yes,yes,,4.0,3.0,2.0,3.0,10.0,50.0
3,GP,F,15,U,GT3,T,4.0,2.0,health,,home,mother,1.0,3.0,0.0,no,yes,yes,yes,yes,-9.0,yes,yes,yes,3.0,2.0,2.0,5.0,2.0,75.0
4,GP,F,16,U,GT3,T,3.0,3.0,other,other,home,father,1.0,2.0,0.0,no,yes,yes,no,yes,-6.0,yes,no,no,4.0,3.0,2.0,5.0,4.0,50.0


In [None]:
def changes(a):
    stud[a] = stud[a].astype(str).apply(lambda x: None if x.strip() == 'nan' else 0 if x.strip() == 'LE3' else 1 if x.strip() == 'GT3' else x)

In [384]:
stud = stud.dropna(axis='index', how='any', subset = stud.columns)

# School

In [385]:
print(stud.school.unique())
print(stud.school.value_counts())

['GP' 'MS']
GP    80
MS    17
Name: school, dtype: int64


Данный столбец указывает школу. Нулевые значения отсутствуют

# Sex

In [386]:
print(stud.sex.unique())
print(stud.sex.value_counts())

['F' 'M']
F    55
M    42
Name: sex, dtype: int64


Данный столбец указывает пол. Нулевые значения отсутствуют. В условии задания не сказано о гендерном разделении учеников, поэтому данный столбец не понадобится

In [387]:
stud.drop(['sex'], inplace = True, axis = 1)

# Age

In [388]:
print(stud.age.unique())
print(stud.age.value_counts())

[16 17 15 18 22 19 20]
18    24
17    24
16    22
15    19
19     6
22     1
20     1
Name: age, dtype: int64


В таблице представлены люди в возрасте от 15 до 22 лет, что соответствует условию задания

# Address

In [389]:
print(stud.address.unique())
display(stud.address.value_counts())

['U' 'R']


U    73
R    24
Name: address, dtype: int64

В данном столбце присутствуют нулевые элементы. То есть мы не можем узнать, откуда ученик

# Famsize

In [390]:
print(stud.famsize.unique())

['GT3' 'LE3']


Заменим значения 'nan' на None, а 'LE3' на 0 и 'GT3' на 1

In [391]:
stud.famsize = stud.famsize.astype(str).apply(lambda x: 0 if x.strip() == 'LE3' else (1 if x.strip() == 'GT3' else x))

In [392]:
print(stud.famsize.unique())
display(stud.famsize.value_counts())

[1 0]


1    66
0    31
Name: famsize, dtype: int64

# Pstatus

In [393]:
print(stud.pstatus.unique())

['T' 'A']


Заменим 'nan' на None, 'A' на 0 и 'T' на 1

In [394]:
stud.pstatus = stud.pstatus.astype(str).apply(lambda x: 0 if x.strip() == 'A' else (1 if x.strip() == 'T' else x))

In [395]:
print(stud.pstatus.unique())
display(stud.pstatus.value_counts())

[1 0]


1    86
0    11
Name: pstatus, dtype: int64

# Medu

In [396]:
print(stud.medu.unique())

[3. 4. 2. 1. 0.]


Заменим 'nan' на None

In [397]:
print(stud.medu.unique())
display(stud.medu.value_counts())

[3. 4. 2. 1. 0.]


4.0    36
2.0    26
3.0    25
1.0     9
0.0     1
Name: medu, dtype: int64

# Fedu

In [398]:
print(stud.fedu.unique())

[3. 4. 2. 1.]


Заменим 'nan' на None и '40' на '4'

In [399]:
stud.fedu = stud.fedu.astype(str).apply(lambda x: 4.0 if x.strip() == '40.0' else x)

In [400]:
print(stud.fedu.unique())
display(stud.fedu.value_counts())

['3.0' '4.0' '2.0' '1.0']


2.0    28
4.0    25
3.0    24
1.0    20
Name: fedu, dtype: int64

# Mjob

In [402]:
print(stud.mjob.unique())
display(stud.mjob.value_counts())

['other' 'services' 'teacher' 'at_home' 'health']


other       38
services    23
teacher     14
at_home     12
health      10
Name: mjob, dtype: int64

# Fjob

In [401]:
print(stud.mjob.unique())

['other' 'services' 'teacher' 'at_home' 'health']


Заменим 'nan' на None

In [404]:
print(stud.fjob.unique())
display(stud.fjob.value_counts())

['other' 'teacher' 'health' 'services' 'at_home']


other       53
services    27
teacher      8
health       5
at_home      4
Name: fjob, dtype: int64

# Reason

In [412]:
print(stud.reason.unique())
display(stud.reason.value_counts())

['home' 'reputation' 'course' 'other']


course        36
reputation    30
home          24
other          7
Name: reason, dtype: int64

# Guardian

In [408]:
print(stud.guardian.unique())
display(stud.guardian.value_counts())

['father' 'mother' 'other']


mother    68
father    21
other      8
Name: guardian, dtype: int64

In [403]:
print(stud.fjob.unique())

['other' 'teacher' 'health' 'services' 'at_home']


Заменим 'nan' на None

# Traveltime

In [409]:
print(stud.traveltime.unique())

[1. 2. 3. 4.]


Заменим 'nan' на None

In [410]:
print(stud.traveltime.unique())
display(stud.traveltime.value_counts())

[1. 2. 3. 4.]


1.0    61
2.0    30
3.0     5
4.0     1
Name: traveltime, dtype: int64

In [407]:
print(stud.guardian.unique())

['father' 'mother' 'other']


In [406]:
print(stud.reason.unique())
display(stud.reason.value_counts())

['home' 'reputation' 'course' 'other']


course        36
reputation    30
home          24
other          7
Name: reason, dtype: int64

Заменим 'nan' на None

Заменим 'nan' на None