# Анализ данных по датасету members

In [1]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply seaborn's default plot theme to subsequent matplotlib figures
sns.set()

In [36]:
# Load only the columns needed for the analysis and set their dtypes at read time.
# The two date columns are converted via `parse_dates`: read_csv's `dtype`
# argument does not support datetime64 with the default C parser (the previous
# version only worked because the slower engine='python' was forced), so the
# default engine is used here.
members_1 = pd.read_csv(
    r'members_1.csv',
    usecols=[
        'name',
        'birthday',
        'sex',
        'profession',
        'salary_usd$',
        'group_id',
        'entry_date',
        'city_name',
        'state_name',
        'age',
    ],
    dtype={
        'name': 'str',
        'sex': 'category',
        'profession': 'str',
        'salary_usd$': 'int64',
        # ids are kept as strings so they can be used as join keys
        'group_id': 'str',
        'city_name': 'str',
        'state_name': 'str',
        'age': 'int64',
    },
    parse_dates=['birthday', 'entry_date'],
)
members_1

Unnamed: 0,name,birthday,sex,profession,salary_usd$,entry_date,group_id,city_name,state_name,age
0,Vanessa Casey,1960-08-08,F,"Engineer, manufacturing systems",1884,2018-05-24,19927387,Montgomery,West Virginia,63
1,Cheryl Carey,1958-12-11,N,Trade union research officer,1518,2019-11-14,19712846,Colonia,New Jersey,65
2,Kendra Valdez,1962-06-16,M,Press photographer,1669,2018-08-06,10528562,Pinckney,Michigan,61
3,Toni Baxter,1992-03-21,F,Emergency planning/management officer,3818,2017-07-18,1291778,Florence,Kentucky,31
4,Shaun Zuniga,1999-07-04,N,"Runner, broadcasting/film/video",4962,2017-11-18,26053931,Nickerson,Kansas,24
...,...,...,...,...,...,...,...,...,...,...
130984,George Mcdaniel,1943-03-14,M,Comptroller,2393,2018-04-14,1222759,Oakhurst,California,80
130985,Kevin Diaz DVM,1999-05-16,F,Customer service manager,4874,2018-12-27,2137821,Spring Lake Park,Minnesota,24
130986,Latasha Alvarez MD,1953-07-07,N,Interpreter,2579,2018-09-20,23343110,Newton County,Missouri,70
130987,Hector Davidson,1945-05-05,N,Lexicographer,2890,2018-11-18,10982362,Houston,Alaska,78


In [37]:
# Check column dtypes and non-null counts of the loaded members frame
members_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130989 entries, 0 to 130988
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   name         130989 non-null  object        
 1   birthday     130989 non-null  datetime64[ns]
 2   sex          130989 non-null  category      
 3   profession   130989 non-null  object        
 4   salary_usd$  130989 non-null  int64         
 5   entry_date   130989 non-null  datetime64[ns]
 6   group_id     130989 non-null  object        
 7   city_name    130989 non-null  object        
 8   state_name   130989 non-null  object        
 9   age          130989 non-null  int64         
dtypes: category(1), datetime64[ns](2), int64(2), object(5)
memory usage: 9.1+ MB


In [38]:
# Load hobby_list_1.csv, reading both id columns as strings
hobby_list_1 = pd.read_csv(
    'hobby_list_1.csv',
    dtype={
        'group_id': 'str',
        'category_id': 'str',
    },
)

In [39]:
# Check column dtypes and non-null counts of the hobby list frame
hobby_list_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8035 entries, 0 to 8034
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   group_id          8035 non-null   object
 1   name              8034 non-null   object
 2   category_id       8035 non-null   object
 3   monthly_fee_USD$  8035 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 251.2+ KB


In [40]:
# Left-join members_1 with hobby_list_1 on group_id into a new frame.
# Both inputs have a `name` column, so pandas suffixes them as name_x / name_y.
members_11 = members_1.merge(hobby_list_1, how='left', on='group_id')
# A left join keeps every member row; a longer result would mean duplicated
# group_id keys in hobby_list_1 silently multiplied member rows.
assert len(members_11) == len(members_1), 'duplicate group_id values in hobby_list_1 inflated the merge'
members_11.head(3)

Unnamed: 0,name_x,birthday,sex,profession,salary_usd$,entry_date,group_id,city_name,state_name,age,name_y,category_id,monthly_fee_USD$
0,Vanessa Casey,1960-08-08,F,"Engineer, manufacturing systems",1884,2018-05-24,19927387,Montgomery,West Virginia,63,Events That Make You Smarter,6,7
1,Cheryl Carey,1958-12-11,N,Trade union research officer,1518,2019-11-14,19712846,Colonia,New Jersey,65,Mix Professionals Chicago,9,18
2,Kendra Valdez,1962-06-16,M,Press photographer,1669,2018-08-06,10528562,Pinckney,Michigan,61,Madison Square Park Mommies!,25,19


In [42]:
# Load the categories table on its own (so it can be inspected), then left-join
# it onto members_11 by category_id to build a new frame.
categories_1 = pd.read_csv('categories_1.csv', dtype={'category_id': 'str'})
members_12 = members_11.merge(categories_1, how='left', on='category_id')
# A left join keeps every input row; a longer result would mean duplicated
# category_id keys in categories_1 silently multiplied rows.
assert len(members_12) == len(members_11), 'duplicate category_id values in categories_1 inflated the merge'
members_12

Unnamed: 0,name_x,birthday,sex,profession,salary_usd$,entry_date,group_id,city_name,state_name,age,name_y,category_id,monthly_fee_USD$,category_name
0,Vanessa Casey,1960-08-08,F,"Engineer, manufacturing systems",1884,2018-05-24,19927387,Montgomery,West Virginia,63,Events That Make You Smarter,6,7,Education & Learning
1,Cheryl Carey,1958-12-11,N,Trade union research officer,1518,2019-11-14,19712846,Colonia,New Jersey,65,Mix Professionals Chicago,9,18,Fitness
2,Kendra Valdez,1962-06-16,M,Press photographer,1669,2018-08-06,10528562,Pinckney,Michigan,61,Madison Square Park Mommies!,25,19,Parents & Family
3,Toni Baxter,1992-03-21,F,Emergency planning/management officer,3818,2017-07-18,1291778,Florence,Kentucky,31,San Francisco Dodgeball Meetup Group,32,1,Sports & Recreation
4,Shaun Zuniga,1999-07-04,N,"Runner, broadcasting/film/video",4962,2017-11-18,26053931,Nickerson,Kansas,24,Passionate Concert Goers,21,9,Music
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130984,George Mcdaniel,1943-03-14,M,Comptroller,2393,2018-04-14,1222759,Oakhurst,California,80,RHYTHMflow Explorations,21,21,Music
130985,Kevin Diaz DVM,1999-05-16,F,Customer service manager,4874,2018-12-27,2137821,Spring Lake Park,Minnesota,24,Aquatic Voyagers Scuba Club,23,18,Outdoors & Adventure
130986,Latasha Alvarez MD,1953-07-07,N,Interpreter,2579,2018-09-20,23343110,Newton County,Missouri,70,Chicago Womens Soccer Meet up,32,5,Sports & Recreation
130987,Hector Davidson,1945-05-05,N,Lexicographer,2890,2018-11-18,10982362,Houston,Alaska,78,Chicago Indonesian (Bahasa Indonesia) Language...,16,23,Language & Ethnic Identity


In [43]:
# Check column dtypes and non-null counts after both merges
members_12.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130989 entries, 0 to 130988
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   name_x            130989 non-null  object        
 1   birthday          130989 non-null  datetime64[ns]
 2   sex               130989 non-null  category      
 3   profession        130989 non-null  object        
 4   salary_usd$       130989 non-null  int64         
 5   entry_date        130989 non-null  datetime64[ns]
 6   group_id          130989 non-null  object        
 7   city_name         130989 non-null  object        
 8   state_name        130989 non-null  object        
 9   age               130989 non-null  int64         
 10  name_y            130974 non-null  object        
 11  category_id       130989 non-null  object        
 12  monthly_fee_USD$  130989 non-null  int64         
 13  category_name     130989 non-null  object        
dtypes: c

In [44]:
# Give the two suffixed `name` columns left by the merge descriptive names
members_12 = members_12.rename(
    columns={
        'name_x': 'member_name',
        'name_y': 'fun_club_name',
    }
)
members_12.head(3)

Unnamed: 0,member_name,birthday,sex,profession,salary_usd$,entry_date,group_id,city_name,state_name,age,fun_club_name,category_id,monthly_fee_USD$,category_name
0,Vanessa Casey,1960-08-08,F,"Engineer, manufacturing systems",1884,2018-05-24,19927387,Montgomery,West Virginia,63,Events That Make You Smarter,6,7,Education & Learning
1,Cheryl Carey,1958-12-11,N,Trade union research officer,1518,2019-11-14,19712846,Colonia,New Jersey,65,Mix Professionals Chicago,9,18,Fitness
2,Kendra Valdez,1962-06-16,M,Press photographer,1669,2018-08-06,10528562,Pinckney,Michigan,61,Madison Square Park Mommies!,25,19,Parents & Family


In [59]:
# Top 5 interest categories by number of club members
(
    members_12
    .groupby('category_name')['member_name']
    .count()
    .sort_values(ascending=False)
    .head(5)
)

category_name
Health & Wellbeing            14174
Language & Ethnic Identity    10788
New Age & Spirituality         9085
Outdoors & Adventure           8730
Sports & Recreation            8277
Name: member_name, dtype: int64

In [81]:
# Top 5 interest categories by the summed salary of their members
(
    members_12
    .groupby('category_name')[['salary_usd$']]
    .sum()
    .sort_values(by='salary_usd$', ascending=False)
    .head(5)
)

Unnamed: 0_level_0,salary_usd$
category_name,Unnamed: 1_level_1
Health & Wellbeing,46179622
Language & Ethnic Identity,34957202
New Age & Spirituality,29713224
Outdoors & Adventure,28251490
Sports & Recreation,26793115
