In [1]:
import pandas as pd
import numpy as np

from random import sample
from scipy import stats

In [5]:
# Source Excel workbook; this name is passed to pd.read_excel when the data is loaded.
# The path is relative, so it is resolved against the kernel's working directory.
file = 'GM23_Master Sales Table (FINAL) - with regressions.xlsx'

In [202]:
# Load the workbook and turn the literal placeholder 'no data' into real missing values.
base_data = pd.read_excel(file).replace('no data', np.nan)

# Columns used by the t-tests below.
relevant_data = base_data[['Company', 'Name of NME Approved', 'Molecule ID',
                           'Year Approved With Increment', 'Shared/Non-Shared',
                           'Self-Originated', 'Molecule Type', 'Novel', 'Oncology',
                           'Year 5 Sales - Inf']]

# Keep molecules approved in 2016-2020 (inclusive) that have a year-5 sales figure.
# NOTE: the variable name says "12_16" but the filter keeps 2016-2020; the name is
# kept unchanged because later cells reference it.
approval_year = relevant_data['Year Approved With Increment']
in_window = (approval_year >= 2016) & (approval_year <= 2020)

# .copy() makes this an independent frame, so adding columns below does not write
# into a slice of relevant_data (which triggers SettingWithCopyWarning).
data_12_16 = (relevant_data[in_window]
              .dropna(axis=0, subset=['Year 5 Sales - Inf'])
              .copy())

# log(sales + 1) - 1, computed in one vectorised step.
# pd.to_numeric guards against the column still being object dtype after the
# 'no data' replacement.
# NOTE(review): the trailing "- 1" is carried over from the original analysis. It is
# a constant shift of every value, so it does not change any t-test result; confirm
# whether it is intentional.
year5_sales = pd.to_numeric(data_12_16['Year 5 Sales - Inf'])
data_12_16['log_y5s'] = np.log(year5_sales + 1) - 1

### T-Test Calculations

In [203]:
def run_ttest(dataset, column, cond_1, cond_2, value_col='log_y5s', equal_var=True):
    """Print group sizes for ``column`` and t-test ``value_col`` between two groups.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing ``column`` and ``value_col``.
    column : str
        Name of the grouping column.
    cond_1, cond_2 : scalar
        Values of ``column`` that select the first and second group. Rows whose
        ``column`` value equals neither (including missing values) are ignored.
    value_col : str, default 'log_y5s'
        Column holding the measurements to compare.
    equal_var : bool, default True
        Forwarded to ``scipy.stats.ttest_ind``. ``True`` is the pooled-variance
        Student's t-test (the original behaviour); ``False`` is Welch's t-test.

    Returns
    -------
    The result object of ``scipy.stats.ttest_ind`` (statistic and p-value). The
    statistic is positive when the ``cond_1`` group has the larger mean.
    """
    # Show how many rows fall in each category so the group sizes are visible
    # next to the test result.
    print(dataset[column].value_counts())

    group_1 = dataset.loc[dataset[column] == cond_1, value_col]
    group_2 = dataset.loc[dataset[column] == cond_2, value_col]
    return stats.ttest_ind(group_1, group_2, equal_var=equal_var)

#### Sharing

In [204]:
# Normalise capitalisation of the sharing labels (a few rows use different casing)
# so that 'shared' / 'non-shared' match every row in the comparison below.
# .str.lower() leaves missing values as NaN, whereas calling x.lower() on each
# element raises AttributeError as soon as one value is missing.
data_12_16['Shared/Non-Shared'] = data_12_16['Shared/Non-Shared'].str.lower()

In [205]:
# Shared vs. non-shared molecules ('shared' is the first group).
run_ttest(dataset=data_12_16,
          column='Shared/Non-Shared',
          cond_1='shared',
          cond_2='non-shared')

non-shared    55
shared        34
Name: Shared/Non-Shared, dtype: int64


Ttest_indResult(statistic=0.1999716096533118, pvalue=0.8419694689640551)

#### Origin

In [206]:
# Self-originated (1) vs. not self-originated (0); the 1 group is passed first here,
# so the sign of the statistic is relative to that group.
run_ttest(dataset=data_12_16,
          column='Self-Originated',
          cond_1=1,
          cond_2=0)

0    56
1    33
Name: Self-Originated, dtype: int64


Ttest_indResult(statistic=0.6658610942624632, pvalue=0.5072615676270846)

#### Molecule Size

In [207]:
# Small vs. large molecules ('Small' is the first group).
run_ttest(dataset=data_12_16,
          column='Molecule Type',
          cond_1='Small',
          cond_2='Large')

Small    53
Large    36
Name: Molecule Type, dtype: int64


Ttest_indResult(statistic=-1.3492592367057572, pvalue=0.1807547115620596)

#### Novelty

In [208]:
# Non-novel (0) vs. novel (1); the 0 group is passed first.
# NOTE(review): the recorded counts for 'Novel' add up to 58 rows, fewer than the
# 89 shown for the other columns, so rows without a 'Novel' value appear to fall in
# neither group - confirm this is acceptable.
run_ttest(dataset=data_12_16,
          column='Novel',
          cond_1=0,
          cond_2=1)

1.0    35
0.0    23
Name: Novel, dtype: int64


Ttest_indResult(statistic=0.912592791468316, pvalue=0.36536892976040203)

#### Oncology

In [209]:
# Non-oncology (0) vs. oncology (1); the 0 group is passed first.
run_ttest(dataset=data_12_16,
          column='Oncology',
          cond_1=0,
          cond_2=1)

0    48
1    41
Name: Oncology, dtype: int64


Ttest_indResult(statistic=1.8802044379831484, pvalue=0.06342648342035591)