In [275]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt
from scipy import stats as stats
import seaborn as sns
%matplotlib inline

### Business problem 1

In [137]:
loan_data = pd.read_csv('LoansData.csv')

In [185]:
loan_data['Loan_Purpose'].unique

<bound method Series.unique of 0       debt_consolidation
1       debt_consolidation
2       debt_consolidation
3       debt_consolidation
4              credit_card
               ...        
2495    debt_consolidation
2496      home_improvement
2497    debt_consolidation
2498        major_purchase
2499    debt_consolidation
Name: Loan_Purpose, Length: 2500, dtype: object>

In [159]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Amount_Requested                2499 non-null   float64
 1   Amount_Funded_By_Investors      2499 non-null   float64
 2   Interest_Rate                   2498 non-null   float64
 3   Loan_Length                     2500 non-null   int64  
 4   Loan_Purpose                    2500 non-null   object 
 5   Debt_To_Income_Ratio            2499 non-null   object 
 6   State                           2500 non-null   object 
 7   Home_Ownership                  2499 non-null   object 
 8   Monthly_Income                  2499 non-null   float64
 9   FICO_Range                      2498 non-null   object 
 10  Open_CREDIT_Lines               2497 non-null   float64
 11  Revolving_CREDIT_Balance        2497 non-null   float64
 12  Inquiries_in_the_Last_6_Months  24

In [141]:
# rename to appropriate column name
loan_data.columns = loan_data.apply(lambda x:x.name.replace(".","_"))

In [143]:
#load_data['Employment.Length']=
loan_data['Employment_Length']=pd.to_numeric(loan_data['Employment_Length'].str.rstrip("+ years").str.lstrip("<>"))
loan_data['Loan_Length'] = pd.to_numeric(loan_data['Loan_Length'].str.rstrip('months'))

In [145]:
#Replace the null values of Employment length by mean of the column values
loan_data['Employment_Length']=np.where(loan_data['Employment_Length'].isna(),loan_data['Employment_Length'].mean(),loan_data['Employment_Length'])

In [92]:
loan_data.isna().sum()

Amount_Requested                  1
Amount_Funded_By_Investors        1
Interest_Rate                     2
Loan_Length                       0
Loan_Purpose                      0
Debt_To_Income_Ratio              1
State                             0
Home_Ownership                    1
Monthly_Income                    1
FICO_Range                        2
Open_CREDIT_Lines                 3
Revolving_CREDIT_Balance          3
Inquiries_in_the_Last_6_Months    3
Employment_Length                 0
dtype: int64

In [147]:
loan_data['Interest_Rate']=pd.to_numeric(loan_data['Interest_Rate'].str.rstrip("%"))

In [149]:
loan_data.columns

Index(['Amount_Requested', 'Amount_Funded_By_Investors', 'Interest_Rate',
       'Loan_Length', 'Loan_Purpose', 'Debt_To_Income_Ratio', 'State',
       'Home_Ownership', 'Monthly_Income', 'FICO_Range', 'Open_CREDIT_Lines',
       'Revolving_CREDIT_Balance', 'Inquiries_in_the_Last_6_Months',
       'Employment_Length'],
      dtype='object')

In [151]:
loan_data.drop_duplicates(inplace =True)

In [161]:
loan_data_cat_col = loan_data.select_dtypes('object').columns
# there are no integer column selecting only float dtype
loan_data_con_col = loan_data.select_dtypes(['float64','int64']).columns

In [163]:
loan_data_cat_col

Index(['Loan_Purpose', 'Debt_To_Income_Ratio', 'State', 'Home_Ownership',
       'FICO_Range'],
      dtype='object')

In [165]:
loan_data_con_col

Index(['Amount_Requested', 'Amount_Funded_By_Investors', 'Interest_Rate',
       'Loan_Length', 'Monthly_Income', 'Open_CREDIT_Lines',
       'Revolving_CREDIT_Balance', 'Inquiries_in_the_Last_6_Months',
       'Employment_Length'],
      dtype='object')

In [167]:
# handling missing values in categorical variable columns
for column in loan_data_cat_col:
    loan_data[column] = np.where(loan_data[column].isna(),loan_data[column].mode()[0],loan_data[column])

In [169]:
# handling missing values in continuous variable columns
for column in loan_data_con_col:
    loan_data[column] = np.where(loan_data[column].isna(),loan_data[column].mean(),loan_data[column])

In [171]:
# outlier treatment
for col in loan_data_con_col:
        loan_data[col] = np.where(loan_data[col] > loan_data[col].quantile(0.99) ,
                                    loan_data[col].quantile(0.99), loan_data[col])

In [173]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Amount_Requested                2500 non-null   float64
 1   Amount_Funded_By_Investors      2500 non-null   float64
 2   Interest_Rate                   2500 non-null   float64
 3   Loan_Length                     2500 non-null   float64
 4   Loan_Purpose                    2500 non-null   object 
 5   Debt_To_Income_Ratio            2500 non-null   object 
 6   State                           2500 non-null   object 
 7   Home_Ownership                  2500 non-null   object 
 8   Monthly_Income                  2500 non-null   float64
 9   FICO_Range                      2500 non-null   object 
 10  Open_CREDIT_Lines               2500 non-null   float64
 11  Revolving_CREDIT_Balance        2500 non-null   float64
 12  Inquiries_in_the_Last_6_Months  25

#### a. Interest rate is varied for different loan amounts (Less interest charged for high loan amounts) 

##### i) Define null and alternate hypothesis:
###### Ho - Interest rate has no dependancy on loan amounts
###### Ha  - Interest rate depends on loan amount

##### ii) At CI =95% p value is 0.05
##### iii) Perform hypothesis test:
##### Pearsonr is used to perform the test


In [175]:
stats.pearsonr(loan_data['Interest_Rate'],loan_data['Amount_Funded_By_Investors'])

PearsonRResult(statistic=0.3356471334323706, pvalue=6.933560221720508e-67)

##### iv) Rejection rule:
######        New p value is lesser than the p value, so Rejecting the null hypothesis.

##### v) Business conclusion:
######       There is corelation between Interest rate and Loan amount. 

#### b. Loan length is directly effecting intrest rate. 

In [None]:
##### i) Define null and alternate hypothesis:
###### Ho - Interest rate has no dependancy on Loan length
###### Ha  - Interest rate depends on loan length

##### ii) At CI =95% p value is 0.05
##### iii) Perform hypothesis test:
##### pearsonr is used to perform the test,

In [177]:
stats.pearsonr(loan_data['Loan_Length'],loan_data['Interest_Rate'])

PearsonRResult(statistic=0.4229100768542532, pvalue=4.873263336758549e-109)

##### iv) Rejection rule:
######        New p value is lesser than the p value, so Rejecting the null hypothesis.
##### v) Business conclusion:
######       There is corelation between Loan length and Interest rate.

#### c. Inrest rate varies for different purpose of loans

##### i) Define null and alternate hypothesis:
###### Ho - Interest rate has no dependancy on Purpose of loan
###### Ha  - Interest rate depends on Purpose of loan

##### ii) At CI =95% p value is 0.05
##### iii) Perform hypothesis test:
#####  ftest is used to perform the test,

In [333]:
s1 = loan_data.loc[ loan_data.Loan_Purpose == 'debt_consolidation', 'Interest_Rate' ]
s2 = loan_data.loc[ loan_data.Loan_Purpose == 'credit_card', 'Interest_Rate' ]
s3 = loan_data.loc[ loan_data.Loan_Purpose == 'other', 'Interest_Rate' ]
s4 = loan_data.loc[ loan_data.Loan_Purpose == 'moving', 'Interest_Rate' ]
s5 = loan_data.loc[ loan_data.Loan_Purpose == 'car', 'Interest_Rate' ]
s6 = loan_data.loc[ loan_data.Loan_Purpose == 'vacation', 'Interest_Rate' ]
s7 = loan_data.loc[ loan_data.Loan_Purpose == 'home_improvement', 'Interest_Rate' ]
s8 = loan_data.loc[ loan_data.Loan_Purpose == 'house', 'Interest_Rate' ]
s9 = loan_data.loc[ loan_data.Loan_Purpose == 'major_purchase', 'Interest_Rate' ]
s10 = loan_data.loc[ loan_data.Loan_Purpose == 'educational', 'Interest_Rate' ]
s11 = loan_data.loc[ loan_data.Loan_Purpose == 'medical', 'Interest_Rate' ]
s12 = loan_data.loc[ loan_data.Loan_Purpose == 'wedding', 'Interest_Rate' ]
s13 = loan_data.loc[ loan_data.Loan_Purpose == 'small_business', 'Interest_Rate' ]
s14 = loan_data.loc[ loan_data.Loan_Purpose == 'renewable_energy', 'Interest_Rate' ]

In [335]:
stats.f_oneway(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14)

F_onewayResult(statistic=7.480187740702924, pvalue=1.1758940981686626e-14)

##### iv) Rejection Rules:
###### Since the new  p-value > 0.05, we accept the H0.
##### v) Business Conclusion:
###### Based on the above analysis we can conclude that the interest has no dependancy on purpose of loan.

#### d.There is relationship between FICO scores and Home Ownership. It means that, People with owning home will have high FICO scores. 

##### 1. Define Null Hypothesis, H0: 
            There is no relationship b/w FICO & Home Ownership
##### 2. Define Alternate Hypothesis, Ha:
           There is a relationship b/w FICO & Home Ownership
##### 3. At CI = 95%:
            p-value = 0.05
            p-cutoff = 1.967
##### 4. Performing the test:         
            chi-square test

In [391]:
### observed frequency table:
#obs_frq = pd.crosstab(index = loan_data['FICO_Range'], columns =loan_data['Home_Ownership'])
obs_frq = pd.crosstab(loan_data['Home_Ownership'], loan_data['FICO_Range'])
obs_frq

FICO_Range,640-644,645-649,650-654,655-659,660-664,665-669,670-674,675-679,680-684,685-689,...,780-784,785-789,790-794,795-799,800-804,805-809,810-814,815-819,820-824,830-834
Home_Ownership,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MORTGAGE,0,1,1,0,41,52,68,79,55,61,...,21,12,15,7,7,6,5,6,0,1
NONE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
OTHER,0,0,0,1,0,0,1,1,2,0,...,0,0,0,0,0,0,0,0,0,0
OWN,1,0,0,0,17,18,13,11,9,10,...,3,2,1,1,3,2,1,0,1,0
RENT,4,2,0,3,67,75,91,75,91,65,...,4,5,4,5,2,3,2,0,0,0


In [399]:
stats.chi2_contingency(obs_frq)[1]

1.0065160858471904e-35

##### 5.Rejection rule:
            The new Pvalue is lesser and Hence we reject H0!.
##### 6.Conclusion
           There is a relationship between Home ownership and FICO score.

## BUSINESS PROBLEM - 2 

#### BUSINESS PROBLEM: We would like to assess if there is any difference in the average price quotes provided by Mary and Barry. 

In [None]:
1. Define Null Hypothesis, H0:
        There is no difference in average price quotes provided by Mary and Barry.
2. Define Alternate Hypothesis, Ha:
         There is a difference in average price quotes provided by Mary and Barry.
        
3. At CI = 95%:
        p-value = 0.05

4. Performing the test:
        T-test 

In [5]:
price_quotes = pd.read_csv('Price_Quotes.csv')

In [7]:
price_quotes

Unnamed: 0,Order_Number,Barry_Price,Mary_Price
0,1,126,114
1,2,110,118
2,3,138,114
3,4,142,111
4,5,146,129
5,6,136,119
6,7,94,97
7,8,103,104
8,9,140,127
9,10,152,133


In [20]:
price_quotes.describe()

Unnamed: 0,Order_Number,Barry_Price,Mary_Price
count,12.0,12.0,12.0
mean,6.5,124.333333,114.75
std,3.605551,20.698412,11.054616
min,1.0,94.0,97.0
25%,3.75,106.75,107.0
50%,6.5,131.0,114.0
75%,9.25,140.5,121.0
max,12.0,152.0,133.0


In [9]:
stats.ttest_rel(price_quotes.Barry_Price,price_quotes.Mary_Price)

TtestResult(statistic=2.5213765108923494, pvalue=0.02840588045242053, df=11)

##### 5. Rejection Rules:
    Since the new  p-value < 0.05, we reject the H0!

##### 6. Business Conclusion:
      Based on the above analysis we can conclude that there is a difference in average price quoted by Mary and Barry.

### Problem Statement 3:
    BUSINESS PROBLEM: Determine what effect, if any, the reengineering effort had on the incidence behavioral problems and staff turnover. i.e To determine if the reengineering effort changed the critical incidence rate. Isthere evidence that the critical incidence rate improved?

In [278]:
df_ngo=pd.read_csv('Treatment_facility.csv')
df_ngo.rename(columns=({'VAR4':'TRFF','VAR5':'CI'}),inplace =True)

In [280]:
df_ngo

Unnamed: 0,Month,Reengineer,Employee_Turnover,TRFF,CI
0,1,Prior,0.0,24.390244,42.682927
1,2,Prior,6.0606,19.354839,25.806452
2,3,Prior,12.1212,35.087719,146.19883
3,4,Prior,3.3333,18.404908,110.429448
4,5,Prior,12.9032,17.964072,23.952096
5,6,Prior,9.6774,41.176471,47.058824
6,7,Prior,11.7647,13.422819,0.0
7,8,Prior,11.4286,31.25,25.0
8,9,Prior,23.0769,17.241379,132.183908
9,10,Prior,15.0,16.574586,16.574586


In [296]:
prior_mean = df_ngo.loc[df_ngo.Reengineer=='Prior','CI']
post_mean = df_ngo.loc[df_ngo.Reengineer=='Post','CI']

##### 1. Define Null Hypothesis, H0:
         There is no effect for re-engineering which dint reduce the critical incidence rate. 
##### 2. Define Alternate Hypothesis, Ha:
         There is an effect for re-engineering which reduces the critical incidence rate.       
##### 3. At CI = 95%:
         p-value = 0.05
##### 4. Performing the test:
          T-test 

In [298]:
print(stats.ttest_ind(prior_mean,post_mean))

TtestResult(statistic=1.627914425352865, pvalue=0.12091989189884148, df=18.0)


##### 5. Rejection Rules:
    Since the new  p-value > 0.05, we accept the H0.
##### 6. Business Conclusion:
      Based on the above analysis we can conclude that there is no effect on reengineering 

## BUSINESS PROBLEM - 4

 We will focus on the prioritization system. If the system is working, then 
high priority jobs, on average, should be completed more quickly than medium priority jobs, 
and medium priority jobs should be completed more quickly than low priority jobs. Use the 
data provided to determine whether this is, in fact, occurring.

In [321]:
priority_df = pd.read_csv('Priority_Assessment.csv')
priority_df

high_mean = priority_df.loc[priority_df.Priority == 'High','Days']
medium_mean = priority_df.loc[priority_df.Priority == 'Medium','Days']
low_mean = priority_df.loc[priority_df.Priority == 'Low','Days']


In [None]:
##### 1. Define Null Hypothesis, H0:
         There is no effect in priority assignment
##### 2. Define Alternate Hypothesis, Ha:
         There is an effect for priority assignment       
##### 3. At CI = 95%:
         p-value = 0.05
##### 4. Performing the test:
          ftest

In [323]:
stats.f_oneway(high_mean,medium_mean,low_mean)

F_onewayResult(statistic=1.812311010076072, pvalue=0.16411459461716182)

##### 5. Rejection Rules:
        Since the new  p-value > 0.05, we accept the H0.
##### 6. Business Conclusion:
        Based on the above analysis we can conclude that there is no effect on prioritization system

### BUSINESS PROBLEM 5: Use the survey results to address the following questions 
        What is the overall level of customer satisfaction? 
        What factors are linked to satisfaction? 
        What is the demographic profile of Film on the Rocks patrons? 
        In what media outlet(s) should the film series be advertised?

In [707]:
films_df = pd.read_csv('Films.csv')
films_df

Unnamed: 0,_rowstate_,Movie,Gender,Marital_Status,Sinage,Parking,Clean,Overall,Age,Income,Hear_About
0,0,Ferris Buellers Day Off,Female,Married,2.0,2.0,2.0,2.0,3.0,1.0,5
1,0,Ferris Buellers Day Off,Female,Single,1.0,1.0,1.0,1.0,2.0,1.0,5
2,0,Ferris Buellers Day Off,Male,Married,2.0,4.0,3.0,2.0,4.0,1.0,5
3,0,Ferris Buellers Day Off,Female,Married,1.0,3.0,2.0,2.0,4.0,1.0,5
4,0,Ferris Buellers Day Off,Female,Married,1.0,1.0,1.0,1.0,3.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...
325,0,Old School,2,2,1.0,2.0,1.0,1.0,2.0,1.0,1
326,0,Old School,1,1,2.0,2.0,2.0,2.0,3.0,3.0,5
327,0,Old School,2,1,2.0,1.0,1.0,2.0,2.0,2.0,5
328,0,Old School,1,1,1.0,1.0,1.0,1.0,2.0,1.0,3


In [709]:
films_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   _rowstate_      330 non-null    int64  
 1   Movie           330 non-null    object 
 2   Gender          330 non-null    object 
 3   Marital_Status  328 non-null    object 
 4   Sinage          328 non-null    float64
 5   Parking         328 non-null    float64
 6   Clean           327 non-null    float64
 7   Overall         328 non-null    float64
 8   Age             328 non-null    float64
 9   Income          314 non-null    float64
 10  Hear_About      323 non-null    object 
dtypes: float64(6), int64(1), object(4)
memory usage: 28.5+ KB


In [711]:
films_df['Gender'] = np.where((films_df.Gender=='Female')|(films_df.Gender=='2'),'2','1')
films_df['Marital_Status'] = np.where((films_df.Marital_Status=='Married')|(films_df.Marital_Status=='1'),'1','2')

In [713]:
for col in films_df:
    films_df[col] = np.where(films_df[col].isna(),films_df[col].mode()[0],films_df[col])

In [715]:
films_df.isna().sum()

_rowstate_        0
Movie             0
Gender            0
Marital_Status    0
Sinage            0
Parking           0
Clean             0
Overall           0
Age               0
Income            0
Hear_About        0
dtype: int64

### 1.What is the overall level of customer satisfaction? 

In [718]:
films_df.Overall.value_counts()/films_df.Overall.value_counts().sum()

Overall
2.0    0.490909
1.0    0.457576
3.0    0.036364
5.0    0.012121
4.0    0.003030
Name: count, dtype: float64

In [720]:
## The overall 90 percent of customer satisfaction is level 1 and 2 which is Excellent and good respectively.

### 2.What factors are linked to satisfaction? 
the factors which are related to satisfaction are Signage,Parking,Clean,Overall

#### Signage and overall satisfaction,
##### 1. Define Null Hypothesis, H0:
         Signage and overall staisfaction are not related
##### 2. Define Alternate Hypothesis, Ha:
         Signage and overall staisfaction are  related   
##### 3. At CI = 95%:
         p-value = 0.05
##### 4. Performing the test:
          chisquare test


In [724]:
obs_freq = pd.crosstab(films_df.Sinage,films_df.Overall)
obs_freq
print('p-value',stats.chi2_contingency(obs_freq)[1])


p-value 4.53298970003405e-19


##### 5. Rejection Rules:
        Since the new  p-value < 0.05, we reject H0
##### 6. Business Conclusion:
        Based on the above analysis we can conclude that Signage and overall satisfaction are dependant

#### Parking and overall satisfaction,
##### 1. Define Null Hypothesis, H0:
         parking and overall staisfaction are not related
##### 2. Define Alternate Hypothesis, Ha:
         parking and overall staisfaction are  related   
##### 3. At CI = 95%:
         p-value = 0.05
##### 4. Performing the test:
          chisquare test

In [728]:
obs_freq = pd.crosstab(films_df.Parking,films_df.Overall)
print('p-value',stats.chi2_contingency(obs_freq)[1])

p-value 4.783162283608494e-40


##### 5. Rejection Rules:
        Since the new  p-value < 0.05, we reject H0
##### 6. Business Conclusion:
        Based on the above analysis we can conclude that parking and overall satisfaction are dependant

#### clean and overall satisfaction,
##### 1. Define Null Hypothesis, H0:
         cleanliness and overall staisfaction are not related
##### 2. Define Alternate Hypothesis, Ha:
         cleanliness and overall staisfaction are  related   
##### 3. At CI = 95%:
         p-value = 0.05
##### 4. Performing the test:
          chisquare test

In [732]:
obs_freq = pd.crosstab(films_df.Clean,films_df.Overall)
print('p-value',stats.chi2_contingency(obs_freq)[1])

p-value 1.7772535915434025e-19


##### 5. Rejection Rules:
        Since the new  p-value < 0.05, we reject H0
##### 6. Business Conclusion:
        Based on the above analysis we can conclude that cleanliness and overall satisfaction are dependant

#### 3.What is the demographic profile of Film on the Rocks patrons?
With the given data the demopraphic profile includes age,gender and income of the people. 

In [736]:
# Gender
Gender = films_df.groupby('Gender')['Gender'].value_counts()
Gender= Gender.rename({'Gender':'Count'}).reset_index()
Gender['percent'] = Gender['count']/Gender['count'].sum()
print(Gender)
#The patron’s gender: 1 = male; 2 = female 
print(" the 64 % patrons are female and remaining are male")

  Gender  count   percent
0      1    117  0.354545
1      2    213  0.645455
 the 64 % patrons are female and remaining are male


In [738]:
# Age
Age = films_df.groupby('Age')['Age'].value_counts()
Age= Age.rename({'Age':'Count'}).reset_index()
Age['percent'] = Age['count']/Age['count'].sum()
print(Age)

# The patron’s age in years: 1= 1-12; 2 = 13-30; 3 = 31-60; 4 = 60+ 
print ("the 53% Age of patrons lies from 13 to 30 and 35% lies between 31 and 60. The remaining are either below 12 or above 60")

   Age  count   percent
0  1.0     26  0.078788
1  2.0    177  0.536364
2  3.0    117  0.354545
3  4.0     10  0.030303
the 53% Age of patrons lies from 13 to 30 and 35% lies between 31 and 60. The remaining are either below 12 or above 60


In [740]:
# Income 
Income = films_df.groupby('Income')['Income'].value_counts()
Income= Income.rename({'Income':'Count'}).reset_index()
Income['percent'] = Income['count']/Income['count'].sum()
#The patron’s annual household income: 1 = Less than $50,000; 2 = $50,000- $100,000; 3 = $100,000+
print(Income)
print("47% of patrons are with income less than $50000")

   Income  count   percent
0     1.0    158  0.478788
1     2.0     82  0.248485
2     3.0     90  0.272727
47% of patrons are with income less than $50000


### 4.In what media outlet(s) should the film series be advertised?

In [743]:
films_df.Hear_About.value_counts()

Hear_About
5      233
4       41
1       22
3       14
2       12
2,5      2
3,4      2
4,5      1
1,5      1
5,4      1
3,5      1
Name: count, dtype: int64

In [745]:
Hear_about = films_df.groupby('Hear_About')['Hear_About'].value_counts()
Hear_about= Hear_about.rename({'Hear_About':'Count'}).reset_index()
Hear_about['percent'] = Hear_about['count']/Hear_about['count'].sum()
print(Hear_about.sort_values('percent',ascending = False))

#1 = television; 2 = newspaper; 3 = radio; 4 = website; 5 = word of mouth
print("70% of Respondents got to know by 'word of mount' ")

   Hear_About  count   percent
9           5    233  0.706061
7           4     41  0.124242
0           1     22  0.066667
4           3     14  0.042424
2           2     12  0.036364
3         2,5      2  0.006061
5         3,4      2  0.006061
1         1,5      1  0.003030
6         3,5      1  0.003030
8         4,5      1  0.003030
10        5,4      1  0.003030
70% of Respondents got to know by 'word of mount' 
