# Выбор локации для скважины

Допустим, вы работаете в добывающей компании «ГлавРосГосНефть». Нужно решить, где бурить новую скважину.

Вам предоставлены пробы нефти в трёх регионах: в каждом 10 000 месторождений, где измерили качество нефти и объём её запасов. Постройте модель машинного обучения, которая поможет определить регион, где добыча принесёт наибольшую прибыль. Проанализируйте возможную прибыль и риски техникой *Bootstrap.*

Шаги для выбора локации:

- В избранном регионе ищут месторождения, для каждого определяют значения признаков;
- Строят модель и оценивают объём запасов;
- Выбирают месторождения с самым высокими оценками значений. Количество месторождений зависит от бюджета компании и стоимости разработки одной скважины;
- Прибыль равна суммарной прибыли отобранных месторождений.

## Загрузка и подготовка данных

### Импортирт библиотек и глобальных переменных 

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from numpy import sqrt

pd.options.mode.chained_assignment = None

state = np.random.RandomState(2020)

### Загрузка данных

In [4]:
data_0 = pd.read_csv('/Users/artemvishanov/Desktop/yandex_practicum_project/8. Выбор локации для скважины/geo_data_0.csv')
data_1 = pd.read_csv('/Users/artemvishanov/Desktop/yandex_practicum_project/8. Выбор локации для скважины/geo_data_1.csv')
data_2 = pd.read_csv('/Users/artemvishanov/Desktop/yandex_practicum_project/8. Выбор локации для скважины/geo_data_2.csv')

In [5]:
print(f'''##### Data_0: #####
            {data_0}''')
print()
print(f'''##### Data_1: #####
            {data_1}''')
print()
print(f'''##### Data_2: #####
            {data_2}''')
print()

##### Data_0: #####
                      id        f0        f1        f2     product
0      txEyH  0.705745 -0.497823  1.221170  105.280062
1      2acmU  1.334711 -0.340164  4.365080   73.037750
2      409Wp  1.022732  0.151990  1.419926   85.265647
3      iJLyR -0.032172  0.139033  2.978566  168.620776
4      Xdl7t  1.988431  0.155413  4.751769  154.036647
...      ...       ...       ...       ...         ...
99995  DLsed  0.971957  0.370953  6.075346  110.744026
99996  QKivN  1.392429 -0.382606  1.273912  122.346843
99997  3rnvd  1.029585  0.018787 -1.348308   64.375443
99998  7kl59  0.998163 -0.528582  1.583869   74.040764
99999  1CWhH  1.764754 -0.266417  5.722849  149.633246

[100000 rows x 5 columns]

##### Data_1: #####
                      id         f0         f1        f2     product
0      kBEdx -15.001348  -8.276000 -0.005876    3.179103
1      62mP7  14.272088  -3.475083  0.999183   26.953261
2      vyE1P   6.263187  -5.948386  5.001160  134.766305
3      KcrkZ -13.081

In [6]:
print(f'''##### Data_0: #####
            {data_0.info()}''')
print()
print(f'''##### Data_1: #####
            {data_1.info()}''')
print()
print(f'''##### Data_2: #####
            {data_2.info()}''')
print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB
##### Data_0: #####
            None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB
##### Data_1: #####
            None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000

In [7]:
print(f'''##### Data_0: #####
            {data_0.describe()}''')
print()
print(f'''##### Data_1: #####
            {data_1.describe()}''')
print()
print(f'''##### Data_2: #####
            {data_2.describe()}''')
print()

##### Data_0: #####
                              f0             f1             f2        product
count  100000.000000  100000.000000  100000.000000  100000.000000
mean        0.500419       0.250143       2.502647      92.500000
std         0.871832       0.504433       3.248248      44.288691
min        -1.408605      -0.848218     -12.088328       0.000000
25%        -0.072580      -0.200881       0.287748      56.497507
50%         0.502360       0.250252       2.515969      91.849972
75%         1.073581       0.700646       4.715088     128.564089
max         2.362331       1.343769      16.003790     185.364347

##### Data_1: #####
                              f0             f1             f2        product
count  100000.000000  100000.000000  100000.000000  100000.000000
mean        1.141296      -4.796579       2.494541      68.825000
std         8.965932       5.119872       1.703572      45.944423
min       -31.609576     -26.358598      -0.018144       0.000000
25%        

In [8]:
data_0.head()

Unnamed: 0,id,f0,f1,f2,product
0,txEyH,0.705745,-0.497823,1.22117,105.280062
1,2acmU,1.334711,-0.340164,4.36508,73.03775
2,409Wp,1.022732,0.15199,1.419926,85.265647
3,iJLyR,-0.032172,0.139033,2.978566,168.620776
4,Xdl7t,1.988431,0.155413,4.751769,154.036647


In [9]:
data_2.head()

Unnamed: 0,id,f0,f1,f2,product
0,fwXo0,-1.146987,0.963328,-0.828965,27.758673
1,WJtFt,0.262778,0.269839,-2.530187,56.069697
2,ovLUW,0.194587,0.289035,-5.586433,62.87191
3,q6cA6,2.23606,-0.55376,0.930038,114.572842
4,WPMUX,-0.515993,1.716266,5.899011,149.600746


In [10]:
data_2.head()

Unnamed: 0,id,f0,f1,f2,product
0,fwXo0,-1.146987,0.963328,-0.828965,27.758673
1,WJtFt,0.262778,0.269839,-2.530187,56.069697
2,ovLUW,0.194587,0.289035,-5.586433,62.87191
3,q6cA6,2.23606,-0.55376,0.930038,114.572842
4,WPMUX,-0.515993,1.716266,5.899011,149.600746


### Проверка дубликатов

In [11]:
print('data_0:', data_0.duplicated().sum())
print('data_1:', data_1.duplicated().sum())
print('data_2:', data_2.duplicated().sum())

data_0: 0
data_1: 0
data_2: 0


**Вывод:**

1. пропущенные значения отсутствуют
2. столбцы названы корректно, переименовывание не требуется
3. столбец "id" - для обучения не нужен
4. features:
    * f0
    * f1
    * f2
5. target:
    * product
6. дубликаты в данных отсутствуют 

## Подготовка данных

In [12]:
features_0 = data_0[['f0', 'f1', 'f2']]
target_0 = data_0[['product']]

In [13]:
features_1 = data_1[['f0', 'f1', 'f2']]
target_1 = data_1[['product']]

In [14]:
features_2 = data_2[['f0', 'f1', 'f2']]
target_2 = data_2[['product']]

**Вывод:**

1. Сформированы наборы features:

    * features_0
    * features_1
    * features_2
    
    
2. Сформированы наборы target:

    * target_0
    * target_1
    * target_2

## Обучение и проверка модели

### Создание обучающих/валидационных наборов

**Функция создания обучающих/валидационных наборов**

In [15]:
def sets_create(features, target):
    features_train, features_valid, target_train, target_valid = train_test_split(features,
                                                                                  target,
                                                                                  train_size=0.75,
                                                                                  random_state=state,
                                                                                  shuffle=False)
    return features_train, features_valid, target_train, target_valid

**Создание обучающих/валидационных наборов для первого региона**

In [16]:
features_0_train, features_0_valid, target_0_train, target_0_valid = sets_create(features=features_0,
                                                                                 target=target_0)
print(f'''##### Feature_0_train shape: #####
            {features_0_train.shape}''')
print()
print(f'''##### Feature_0_valid shape: #####
            {features_0_valid.shape}''')
print()
print(f'''##### Target_0_train shape: #####
            {target_0_train.shape}''')
print()
print(f'''##### Target_0_valid shape: #####
            {target_0_valid.shape}''')

##### Feature_0_train shape: #####
            (75000, 3)

##### Feature_0_valid shape: #####
            (25000, 3)

##### Target_0_train shape: #####
            (75000, 1)

##### Target_0_valid shape: #####
            (25000, 1)


**Создание обучающих/валидационных наборов для второго региона**

In [17]:
features_1_train, features_1_valid, target_1_train, target_1_valid = sets_create(features=features_1,
                                                                                 target=target_1)
print(f'''##### Feature_1_train shape: #####
            {features_1_train.shape}''')
print()
print(f'''##### Feature_1_valid shape: #####
            {features_1_valid.shape}''')
print()
print(f'''##### Target_1_train shape: #####
            {target_1_train.shape}''')
print()
print(f'''##### Target_1_valid shape: #####
            {target_0_valid.shape}''')

##### Feature_1_train shape: #####
            (75000, 3)

##### Feature_1_valid shape: #####
            (25000, 3)

##### Target_1_train shape: #####
            (75000, 1)

##### Target_1_valid shape: #####
            (25000, 1)


**Создание обучающих/валидационных наборов для третьего региона**

In [18]:
features_2_train, features_2_valid, target_2_train, target_2_valid = sets_create(features=features_2,
                                                                                 target=target_2)
print(f'''##### Feature_2_train shape: #####
            {features_2_train.shape}''')
print()
print(f'''##### Feature_2_valid shape: #####
            {features_2_valid.shape}''')
print()
print(f'''##### Target_2_train shape: #####
            {target_2_train.shape}''')
print()
print(f'''##### Target_2_valid shape: #####
            {target_2_valid.shape}''')

##### Feature_2_train shape: #####
            (75000, 3)

##### Feature_2_valid shape: #####
            (25000, 3)

##### Target_2_train shape: #####
            (75000, 1)

##### Target_2_valid shape: #####
            (25000, 1)


**Вывод:**

1. Созданы обучающие наборы:

    * features_0_train
    * target_0_train
    * features_1_train
    * target_1_train
    * features_2_train
    * target_2_train
    
    
2. Созданы валидационные наборы:

    * features_0_valid
    * target_0_valid
    * features_1_valid
    * target_1_valid
    * features_2_valid
    * target_2_valid

### Обучение и проверка моделей

**Функция обучения/предсказания моделей:**

In [19]:
def model_predict(features_train, target_train, features_valid, target_valid):
    scaler = StandardScaler()

    scaler.fit(features_train)

    features_train = scaler.transform(features_train)
    features_valid = scaler.transform(features_valid)

    model = LinearRegression()
    model.fit(features_train, target_train)

    predicted_valid = model.predict(features_valid)
    return model.score(features_valid, target_valid), predicted_valid

**Создание обучающих/валидационных наборов для первого региона**

In [20]:
model_0_score, predicted_valid_0 = model_predict(features_train = features_0_train, 
                                                 target_train = target_0_train, 
                                                 features_valid = features_0_valid, 
                                                 target_valid = target_0_valid)

**Создание обучающих/валидационных наборов для второго региона**

In [21]:
model_1_score, predicted_valid_1 = model_predict(features_train = features_1_train, 
                                                 target_train = target_1_train, 
                                                 features_valid = features_1_valid, 
                                                 target_valid = target_1_valid)

**Создание обучающих/валидационных наборов для третьего региона**

In [22]:
model_2_score, predicted_valid_2 = model_predict(features_train = features_2_train, 
                                                 target_train = target_2_train, 
                                                 features_valid = features_2_valid, 
                                                 target_valid = target_2_valid)

### Результаты предсказаний для всех регионов

In [23]:
target_0_predict = pd.concat([target_0_valid.reset_index(drop=True), pd.DataFrame(predicted_valid_0)], axis=1)
target_1_predict = pd.concat([target_1_valid.reset_index(drop=True), pd.DataFrame(predicted_valid_1)], axis=1)
target_2_predict = pd.concat([target_2_valid.reset_index(drop=True), pd.DataFrame(predicted_valid_2)], axis=1)

target_0_predict = target_0_predict.rename(columns={0:'predicted_product'})
target_1_predict = target_1_predict.rename(columns={0:'predicted_product'})
target_2_predict = target_1_predict.rename(columns={0:'predicted_product'})

In [24]:
model_results = pd.DataFrame({'real_materials_volume'        :[target_0_valid['product'].sum(),
                                                               target_1_valid['product'].sum(),
                                                               target_2_valid['product'].sum()],
                              'predict_material_volume'      : [predicted_valid_0.sum(),
                                                                predicted_valid_1.sum(),
                                                                predicted_valid_2.sum()],
                              'mean_predict_materila_volume' : [predicted_valid_0.mean(),
                                                                predicted_valid_1.mean(),
                                                                predicted_valid_2.mean()],
                              'RMSE'                         : [sqrt(mean_squared_error(target_0_valid, predicted_valid_0)),
                                                                sqrt(mean_squared_error(target_1_valid, predicted_valid_1)),
                                                                sqrt(mean_squared_error(target_2_valid, predicted_valid_2))],
                              'R2-score'                     : [model_0_score,
                                                                model_1_score,
                                                                model_2_score]
}
)

model_results

Unnamed: 0,real_materials_volume,predict_material_volume,mean_predict_materila_volume,RMSE,R2-score
0,2310666.0,2314842.0,92.593681,37.650563,0.273593
1,1723502.0,1723938.0,68.957534,0.894446,0.999622
2,2372809.0,2381887.0,95.275462,40.003681,0.202021


**Вывод:**

1. Правильные ответы и предсказания моделей сохранены в:
    * model_results
    
    
2. Анализ характеристик каждого региона показал:
    * наиболее перспектиные регионы по среднему предсказанному запасу сырья являются 1-й и 3-й регион
    * минимальное значение RMSE получено для 2-го региона
    * максимальное качество(коэффициент детерминации) имеет модель полученная для 2-го региона

## Подготовка к расчёту прибыли

In [25]:
MATERIAL_UNIT_PRICE = 10**3*450                      # доход с единицы сырья(1000 баррелей)
INVESTMENT_SUM = 10**10                             # бюджет на разработку скважин
HOLE_PRICE = INVESTMENT_SUM/200                      # стоимость стротельства одной скважины
HOLE_TRESHOLD = HOLE_PRICE/MATERIAL_UNIT_PRICE       # необходимый запас сырья для безубыточности скважины
REGION_TRESHOLD = INVESTMENT_SUM/MATERIAL_UNIT_PRICE # необходимый запас сырья для безубыточности региона
print(f'''
Порог безубыточности скважины: {HOLE_TRESHOLD} ед. сырья
Порог безубыточности региона:  {REGION_TRESHOLD} ед. сырья''')


Порог безубыточности скважины: 111.11111111111111 ед. сырья
Порог безубыточности региона:  22222.222222222223 ед. сырья


**Вывод:**

Для начала освоения региона необходимо, чтобы запасы сырья в отобранных 200 скважинах были более 22223 ед. сырья. Запасы сырьяя в каждой отобранной скважине должны быть более 112 ед. сырья.

## Расчёт прибыли и рисков 

### Функция расчета прибыли

In [26]:
def material_sum_revenue(target, probabilities, count):
    probs_sorted = probabilities.sort_values(ascending=False)
    selected = target[probs_sorted.index][:count]
    material_sum = selected.sum()
    material_revenue = material_sum*MATERIAL_UNIT_PRICE - INVESTMENT_SUM
    return pd.DataFrame({'material_sum':[material_sum], 'material_revenue':[material_revenue]})

### Функция bootstrap

In [27]:
def bootstrap(target_valid, probabilities_valid, hole_count):
    region_boot = pd.DataFrame()
    for i in range(1000):
        target_subsample = target_valid.sample(n=hole_count, replace=True, random_state=state)
        probs_subsample = probabilities_valid[target_subsample.index]
        region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
    return  region_boot

### Распределение значений запасов сырья и прибыли для регионов

In [28]:
region_0_boot = bootstrap(target_valid=target_0_predict['product'],
                          probabilities_valid=target_0_predict['predicted_product'],
                          hole_count=500)

region_1_boot = bootstrap(target_valid=target_1_predict['product'],
                            probabilities_valid=target_1_predict['predicted_product'],
                            hole_count=500)

region_2_boot = bootstrap(target_valid=target_2_predict['product'],
                            probabilities_valid=target_2_predict['predicted_product'],
                            hole_count=500)

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample, probs_subsample, 200), ignore_index=True)
  region_boot = region_boot.append(material_sum_revenue(target_subsample

In [29]:
region_0_boot

Unnamed: 0,material_sum,material_revenue
0,23360.146756,5.120660e+08
1,22474.088624,1.133399e+08
2,22179.510065,-1.922047e+07
3,21758.135034,-2.088392e+08
4,23968.189502,7.856853e+08
...,...,...
995,24200.388292,8.901747e+08
996,23014.834460,3.566755e+08
997,23820.702656,7.193162e+08
998,22588.769489,1.649463e+08


### Функция расчета доверительного интервала 

In [30]:
def get_confidence_interval(region_boot):
    return (region_boot.material_revenue.quantile(0.025), region_boot.material_revenue.quantile(0.975))

### Расчет доверительного интервала

In [31]:
confidence_interval_region_0 = get_confidence_interval(region_0_boot)
confidence_interval_region_1 = get_confidence_interval(region_1_boot)
confidence_interval_region_2 = get_confidence_interval(region_2_boot)

### Функция расчета риска разработки региона

In [32]:
def risk_percent(region_boot):
    low_treshold_count = len(region_boot.query('material_revenue <= 0'))
    return low_treshold_count/len(region_boot) * 100

### Расчет риска разработки региона

In [33]:
risk_region_0 = risk_percent(region_0_boot)
risk_region_1 = risk_percent(region_1_boot)
risk_region_2 = risk_percent(region_2_boot)

# Формирование итоговой таблицы

In [34]:
region_final = pd.DataFrame({'Средняя прибыль региона, руб.'      : [region_0_boot.material_revenue.mean(),
                                                                     region_1_boot.material_revenue.mean(),
                                                                     region_2_boot.material_revenue.mean()],
                             '95%-й доверительный интервал, руб.' : [confidence_interval_region_0,
                                                                     confidence_interval_region_1,
                                                                     confidence_interval_region_2],
                             'Вероятность убытков, %'             : [risk_region_0,
                                                                     risk_region_1,
                                                                     risk_region_2]}, index=['Регион 1','Регион 2','Регион 3'])
region_final


Unnamed: 0,"Средняя прибыль региона, руб.","95%-й доверительный интервал, руб.","Вероятность убытков, %"
Регион 1,435895300.0,"(-105951189.97876148, 936297443.5804422)",5.0
Регион 2,541762500.0,"(120267392.5062757, 998014332.9608271)",0.4
Регион 3,533798300.0,"(100808174.70538989, 968727541.5222876)",1.1


# Вывод:

**Для разработки рекомендуется "Регион 2", по следующим причинам:**
   * минимальная вероятность убытков ~ 0.4%
   * максимальная средняя прибыль из предложенных для анализа регионов