# от Бондаренко Алексея (bondaleksey@gmail.com)

Цель данного ноутбука — провести моделирование распространения болезни по аэропортам и рассмотреть некоторые функции библиотеки для работы с графами NetworkX.


Основные пункты исследования: 

* реализация функции моделирования распространения болезни,
* оценка скорости распространения болезни,
* исследование графа распространения (при p = 0.5),
* заключение.

In [1]:
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt

#technical 
%matplotlib inline
import warnings
import contextlib
import time

# 1. Данные

## 1.1 Описание признаков в наборе данных

**Year** - year of the flight (stored as factor).

**Month** - month of the flight (stored as factor).

**DayofMonth** - day of the month (1 to 31) (stored as integer).

**DayOfWeek** - day of the week (stored as factor).

**DepTime** - actual departure time (stored as float).

**CRSDepTime** - scheduled departure time (stored as float).

**ArrTime** - actual arrival time (stored as float).

**CRSArrTime** - scheduled arrival time (stored as float).

**UniqueCarrier** - carrier ID (stored as factor).

**FlightNum** - flight number (stored as factor).

**TailNum** - plane's tail number (stored as factor).

**ActualElapsedTime** - actual elapsed time of the flight, in minutes (stored as integer).

**CRSElapsedTime** - scheduled elapsed time of the flight, in minutes (stored as integer).

**AirTime** - airborne time for the flight, in minutes (stored as integer).

**ArrDelay** - arrival delay, in minutes (stored as integer).

**DepDelay** - departure delay, in minutes (stored as integer).

**Origin** - originating airport (stored as factor).

**Dest** - destination airport (stored as factor).

**Distance** - flight distance (stored as integer).

**TaxiIn** - taxi time from wheels down to arrival at the gate, in minutes (stored as integer).

**TaxiOut** - taxi time from departure from the gate to wheels up, in minutes (stored as integer).

**Cancelled** - cancellation status (stored as logical).

**CancellationCode** - cancellation code, if applicable (stored as factor).

**Diverted** - diversion status (stored as logical).

**CarrierDelay** - delay, in minutes, attributable to the carrier (stored as integer).

**WeatherDelay** - delay, in minutes, attributable to weather factors (stored as integer).

**NASDelay** - delay, in minutes, attributable to the National Aviation System (stored as integer).

**SecurityDelay** - delay, in minutes, attributable to security factors (stored as integer).

**LateAircraftDelay** - delay, in minutes, attributable to late-arriving aircraft (stored as integer).

## 1.2. Чтение данных 

In [35]:
# Load the flight records from CSV. low_memory=False has pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
filename = "../data/airline_dec_2008_50k.csv"
df = pd.read_csv(filename, low_memory=False)

# Preview the first ten rows in two pieces (columns 0-9, then the rest) so the
# wide table stays readable when printed.
print(df.head(10).iloc[:, :10])
print(df.head(10).iloc[:, 10:])

   Unnamed: 0  Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  \
0           1  2008     12           1          1      NaN        1000   
1           2  2008     12           1          1      NaN        1000   
2           3  2008     12           1          1      NaN        1000   
3           4  2008     12           1          1      NaN        1000   
4           5  2008     12           1          1      NaN        1000   
5           6  2008     12           1          1      NaN        1005   
6           7  2008     12           1          1      NaN        1015   
7           8  2008     12           1          1      NaN        1020   
8           9  2008     12           1          1      NaN        1029   
9          10  2008     12           1          1      NaN        1030   

   ArrTime  CRSArrTime UniqueCarrier  
0      NaN        1100            WN  
1      NaN        1110            US  
2      NaN        1125            MQ  
3      NaN        1227       

In [3]:
# Show the dimensions of the raw table as (rows, columns).
print(df.shape)

(50000, 30)


In [4]:
# Summarise column dtypes and non-null counts to see which fields have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         50000 non-null  int64  
 1   Year               50000 non-null  int64  
 2   Month              50000 non-null  int64  
 3   DayofMonth         50000 non-null  int64  
 4   DayOfWeek          50000 non-null  int64  
 5   DepTime            32968 non-null  float64
 6   CRSDepTime         50000 non-null  int64  
 7   ArrTime            31418 non-null  float64
 8   CRSArrTime         50000 non-null  int64  
 9   UniqueCarrier      50000 non-null  object 
 10  FlightNum          50000 non-null  int64  
 11  TailNum            44556 non-null  object 
 12  ActualElapsedTime  31367 non-null  float64
 13  CRSElapsedTime     50000 non-null  int64  
 14  AirTime            31367 non-null  float64
 15  ArrDelay           31367 non-null  float64
 16  DepDelay           329

In [5]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,...,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,50000.0,50000.0,50000.0,50000.0,50000.0,32968.0,50000.0,31418.0,50000.0,50000.0,...,50000.0,31418.0,32458.0,50000.0,50000.0,4157.0,4157.0,4157.0,4157.0,4157.0
mean,25000.5,2008.0,12.0,11.3509,3.695,752.043102,978.93092,791.598829,1076.86972,2592.85548,...,572.15812,7.166879,16.336866,0.35558,0.01708,12.044503,2.584316,18.813327,0.152273,7.957662
std,14433.901067,0.0,0.0,7.843122,1.889499,364.602722,496.930255,193.315743,495.563167,2116.729997,...,446.647873,5.53938,9.846548,0.478693,0.129571,31.444616,13.186963,21.499273,2.125548,29.794825
min,1.0,2008.0,12.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,...,30.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12500.75,2008.0,12.0,5.0,2.0,612.0,630.0,742.0,805.0,791.0,...,261.0,4.0,10.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
50%,25000.5,2008.0,12.0,10.0,4.0,657.0,735.0,831.0,910.0,2021.0,...,446.0,6.0,14.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0
75%,37500.25,2008.0,12.0,17.0,5.0,750.0,1300.0,914.0,1315.0,4114.0,...,733.0,9.0,20.0,1.0,0.0,14.0,0.0,25.0,0.0,0.0
max,50000.0,2008.0,12.0,31.0,7.0,2400.0,2359.0,2359.0,2359.0,7829.0,...,3303.0,156.0,206.0,1.0,1.0,1092.0,209.0,326.0,96.0,336.0


## 1.3  DataFrame с рейсами фактических перелетов между аэропортами

Если рейс был отменён или перенаправлен, то информацию о таких перелётах исключим из рассмотрения при моделировании.
Наиболее простой способ — выделить только те рейсы, в которых **ActualElapsedTime** не равно `NaN`.

In [52]:
# Keep only flights that were actually flown. The notebook uses a non-null
# ActualElapsedTime as the marker of a completed flight, which is meant to
# drop cancelled and diverted flights.
# A single .loc call selects rows and columns in one step instead of the
# chained df[mask][cols] form, which first materialises an intermediate frame
# with every column. The explicit .copy() makes `flights` an independent
# DataFrame rather than leaving its relationship to `df` implicit.
flights = df.loc[
    df["ActualElapsedTime"].notna(),
    [
        "Origin",
        "Dest",
        "DayofMonth",
        "CRSDepTime",
        "CRSArrTime",
        "DepTime",
        "ArrTime",
        "Cancelled",
        "Diverted",
    ],
].copy()
flights.head()

Unnamed: 0,Origin,Dest,DayofMonth,CRSDepTime,CRSArrTime,DepTime,ArrTime,Cancelled,Diverted
17622,SLC,JFK,1,10,629,7.0,659.0,0,0
17648,SLC,JFK,13,5,626,7.0,631.0,0,0
17666,SLC,JFK,14,5,626,3.0,634.0,0,0
17700,SLC,JFK,15,5,626,2.0,602.0,0,0
18122,DFW,TUL,20,2310,5,3.0,58.0,0,0


In [50]:
# Sanity check: summing the 0/1 Cancelled flag counts the cancelled flights
# still present after filtering.
flights[["Cancelled"]].sum()

Cancelled    0
dtype: int64

In [51]:
# Sanity check: summing the 0/1 Diverted flag counts the diverted flights
# still present after filtering.
flights[["Diverted"]].sum()

Diverted    0
dtype: int64

In [53]:
# Number of flights kept and number of selected columns, as (rows, columns).
flights.shape

(31367, 9)

In [54]:
# Show dtypes and non-null counts for the selected columns of the filtered table.
flights.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31367 entries, 17622 to 49999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Origin      31367 non-null  object 
 1   Dest        31367 non-null  object 
 2   DayofMonth  31367 non-null  int64  
 3   CRSDepTime  31367 non-null  int64  
 4   CRSArrTime  31367 non-null  int64  
 5   DepTime     31367 non-null  float64
 6   ArrTime     31367 non-null  float64
 7   Cancelled   31367 non-null  int64  
 8   Diverted    31367 non-null  int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 2.4+ MB


In [55]:
# Descriptive statistics for the numeric columns of the filtered flights.
flights.describe()

Unnamed: 0,DayofMonth,CRSDepTime,CRSArrTime,DepTime,ArrTime,Cancelled,Diverted
count,31367.0,31367.0,31367.0,31367.0,31367.0,31367.0,31367.0
mean,7.209488,739.032231,800.524437,715.190806,791.311984,0.0,0.0
std,4.686065,331.297472,210.604854,314.193024,192.926966,0.0,0.0
min,1.0,5.0,1.0,1.0,1.0,0.0,0.0
25%,3.0,614.0,745.0,610.0,742.0,0.0,0.0
50%,7.0,700.0,835.0,654.0,831.0,0.0,0.0
75%,10.0,740.0,915.0,740.0,914.0,0.0,0.0
max,31.0,2359.0,2359.0,2400.0,2359.0,0.0,0.0
