In [219]:
# List the entries of the data directory (shell escape; output is shown below).
!ls data

SCH111	SCH31  SCH371  SCH422  SCH4F2  test
SCH1A1	SCH32  SCH421  SCH4F1  SCH4F3


In [62]:
import pandas as pd
from itertools import groupby

from glob import glob

In [198]:
# Collect every entry two levels below data/ (data/<folder>/<file>).
# glob() returns matches in a filesystem-dependent order, so sort the listing
# to make the processing order reproducible between runs and machines.
data_files = sorted(glob('data/*/*'))

In [220]:
# Split the listing in two: the single path whose lower-cased name contains
# "test" goes to test_file (the unpacking raises ValueError unless exactly one
# such path exists); every other path stays in data_files.
test_file, = (path for path in data_files if 'test' in path.lower())
data_files = [path for path in data_files if 'test' not in path.lower()]

In [241]:
def read_df(file_name, max_t=30, sep=';', min_year=1966):
    """Load one header-less station file and return its cleaned temperature columns.

    Parameters
    ----------
    file_name : str
        Path of the delimited text file; it must contain 14 columns and no
        header row.
    max_t : number, default 30
        Upper cap applied to both ``t_min`` and ``t_max``.
    sep : str, default ';'
        Field separator. For ``';'`` files the temperature fields are
        whitespace-stripped and rows with an empty ``t_min`` or ``t_max``
        are dropped before the numeric conversion.
    min_year : int, default 1966
        Rows with an earlier year are discarded.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``month``, ``day`` (uint16) and ``t_min``,
        ``t_max`` (float32), with a fresh 0..n-1 index.
    """
    header = ['station_id', 'year', 'month', 'day', 'na1',
              't_min', 'na2', 't_mean', 'na3', 't_max', 'prec1', 'prec2', 'na4', 'na5']
    df = pd.read_csv(file_name, sep=sep, header=None)
    df.columns = header

    # Work on an explicit copy so the assignments below modify this frame
    # rather than a view of the freshly parsed one.
    df = df[df.year >= min_year].copy()

    if sep == ';':
        # Temperature fields may be space-padded, and a blank field means
        # "no measurement": normalise the text and drop those rows.
        # (t_mean is not cleaned because it is not part of the result.)
        for col in ('t_min', 't_max'):
            df[col] = df[col].astype('str').str.strip()
        df = df[(df.t_min != '') & (df.t_max != '')].copy()

    for col in ('t_min', 't_max'):
        df[col] = df[col].astype('float32')

    for col in ('year', 'month', 'day'):
        df[col] = df[col].astype('uint16')

    # Cap both temperature columns at max_t. The cap for the daily maximum
    # must be written back to 't_max' itself, not to a new 'max_t' column.
    df.loc[df.t_max > max_t, 't_max'] = max_t
    df.loc[df.t_min > max_t, 't_min'] = max_t

    return df[['year', 'month', 'day', 't_min', 't_max']].reset_index(drop=True)

In [63]:
def groupby_boolean(bool_series):
    """Label each element with the ordinal of the run of equal values it is in.

    Consecutive equal values form one run; runs are numbered 0, 1, 2, ... in
    order of appearance. The result is a list with one id per input element.
    """
    group_ids = []
    for run_no, (_, run) in enumerate(groupby(bool_series)):
        # Emit the run number once for every member of the run.
        group_ids.extend(run_no for _ in run)
    return group_ids

In [44]:
# Candidate threshold values and the one currently selected (the first).
# The misspelled names are kept as they are, since they are module-level
# names that other cells may reference.
thresolds = list(range(0, 20, 5))
thresold = thresolds[0]

In [49]:
def join_multicolumns(idx):
    """Flatten a two-level column MultiIndex into ``'<top>_<second>'`` names.

    Parameters
    ----------
    idx : pandas.MultiIndex
        Index with (at least) two levels, e.g. the columns produced by
        ``DataFrame.agg`` with several functions per column.

    Returns
    -------
    list of str
        One joined name per entry of ``idx``, in the same order.
    """
    # get_level_values() is used instead of levels[...][labels[...]]:
    # MultiIndex.labels was deprecated in pandas 0.24 and later removed,
    # while get_level_values() works on old and new versions alike.
    top_vals = idx.get_level_values(0)
    sec_vals = idx.get_level_values(1)

    return ['%s_%s' % pair for pair in zip(top_vals, sec_vals)]

def last(series):
    """Return the element in the final position of ``series``.

    Intended as an aggregation function; the function name is what shows up
    in the aggregated column label (``month_last`` in find_gdd_groups).
    """
    final_value = series.iloc[-1]
    return final_value

def first(series):
    """Return the element in the initial position of ``series``.

    Intended as an aggregation function; the function name is what shows up
    in the aggregated column label (``month_first`` in find_gdd_groups).
    """
    initial_value = series.iloc[0]
    return initial_value

In [216]:
def find_gdd_groups(df, threshold):
    """Select the runs of days above/below ``threshold`` that count as relevant.

    ``df`` must provide ``t_mean``, ``month`` and ``year`` columns; it is
    copied, so the caller's frame is left untouched.

    Steps:
      1. Subtract ``threshold`` from ``t_mean`` and flag days whose excess
         is >= 0.
      2. Give consecutive days with the same flag a common ``group_id``
         (via ``groupby_boolean``).
      3. Per (group_id, year) compute the summed excess and the first and
         last month of the run.
      4. Keep groups matching the "spring", "autumn" or "summer" rule below.

    Returns the per-(group_id, year) summary rows (indexed by ``group_id``,
    with columns ``year``, ``t_mean_sum``, ``month_first``, ``month_last``)
    of the selected groups.
    """
    df = df.copy()

    # From here on t_mean holds the excess over the threshold.
    df.t_mean = df.t_mean - threshold
    above_threshold = df.t_mean >= 0
    df['above_threshold'] = above_threshold

    # One id per run of consecutive equal above/below flags.
    df['group_id'] = groupby_boolean(above_threshold)

    # Aggregate each run per calendar year; the two-level result columns are
    # flattened to t_mean_sum / month_first / month_last.
    group_sums = df.groupby(['group_id', 'year']) \
                   .agg({'t_mean': 'sum',
                         'month': [first, last]})
    group_sums.columns = join_multicolumns(group_sums.columns)
    group_sums = group_sums.reset_index().set_index('group_id')
    # NOTE(review): a run that crosses a year boundary produces several rows
    # with the same group_id, so this index is not necessarily unique and the
    # row shifts below may pair two parts of the same run -- confirm intended.

    # "Spring" rule: the row ends in June or earlier, has a positive sum, and
    # that sum exceeds the negated sum of the following row.
    group_next = group_sums.shift(-1)
    ig_spring = (group_sums.month_last <= 6) & \
                (group_sums.t_mean_sum > 0) & \
                (group_sums.t_mean_sum > -group_next.t_mean_sum)

    # "Autumn" rule: the row ends after June, has a positive sum, and that
    # sum exceeds the negated sum of the preceding row.
    group_prev = group_sums.shift(1)
    ig_autumn = (group_sums.month_last > 6) & \
                (group_sums.t_mean_sum > 0) & \
                (group_sums.t_mean_sum > -group_prev.t_mean_sum)

    # "Summer" rule: the row starts before June, ends after June, spans at
    # least three months and has a positive sum.
    ig_summer = (group_sums.month_first < 6) & \
                (group_sums.month_last > 6) & \
                (group_sums.month_last - group_sums.month_first >= 3) & \
                (group_sums.t_mean_sum > 0)

    # Keep only the entries where the rule holds; reset_index() turns each
    # mask into a frame with the group_id column plus the mask values.
    ig_spring = ig_spring[ig_spring == True].reset_index()
    ig_autumn = ig_autumn[ig_autumn == True].reset_index()
    ig_summer = ig_summer[ig_summer == True].reset_index()

    # Union of the three selections: sort by group_id, drop the mask-value
    # column (labelled 0) and remove repeated ids.
    interesting_groups = pd.concat([ig_spring, ig_summer, ig_autumn]) \
                       .sort_values('group_id') \
                       .reset_index(drop=1) \
                       .drop(0, axis=1) \
                       .drop_duplicates()

    #return group_sums, interesting_groups
    # Label-based lookup: returns every summary row carrying a selected id.
    group_sums_selected = group_sums.loc[interesting_groups.group_id]
    return group_sums_selected

In [224]:
def calculate_all_gdd(df, thresholds=(0, 5, 10, 15)):
    """Compute the yearly summed excess temperature for several thresholds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``year``, ``month``, ``t_min`` and ``t_max`` columns.
        It is not modified; the mean temperature is added to a copy.
    thresholds : iterable of int, default (0, 5, 10, 15)
        Threshold values passed one by one to ``find_gdd_groups``.

    Returns
    -------
    pandas.DataFrame
        One ``threshold_<t>`` column per threshold holding, for each year,
        the sum of ``t_mean_sum`` over the groups selected by
        ``find_gdd_groups``; the year index is moved into a regular column.
    """
    # assign() returns a new frame, so the caller's df does not silently
    # gain a 't_mean' column.
    df = df.assign(t_mean=(df.t_max + df.t_min) / 2)

    thresholds = list(thresholds)
    columns = ['threshold_%d' % t for t in thresholds]
    df_res = {}

    for t, col in zip(thresholds, columns):
        gdd = find_gdd_groups(df, threshold=t)
        df_res[col] = gdd.groupby('year').t_mean_sum.sum()

    df_res = pd.DataFrame(df_res, columns=columns)
    # reset_index must be *called*: returning the bound method itself would
    # hand callers a function instead of a DataFrame.
    return df_res.reset_index()

In [226]:
def calculate_gdd_add(df, t_add):
    """Run ``calculate_all_gdd`` on a copy of ``df`` with temperatures shifted.

    ``t_add`` is added to both ``t_min`` and ``t_max``; the frame passed in
    is left unchanged.
    """
    shifted = df.assign(t_min=df.t_min + t_add,
                        t_max=df.t_max + t_add)
    return calculate_all_gdd(shifted)

In [230]:
from tqdm import tqdm_notebook as tqdm

In [236]:
# Create the output directory; -p makes the cell safe to re-run when the
# directory already exists.
!mkdir -p results

In [245]:
# Alternative input kept for reference: the tab-separated test file.
#df = read_df(test_file, sep='\t')

# For every station file, write one tab-separated result table per
# temperature offset into results/.
for f in tqdm(data_files):
    # Paths ending in .txt are not station data and are skipped.
    if not f.endswith('.txt'):
        df = read_df(f, sep=';')
        # Drop the leading 'data/' (5 characters) and flatten the remaining
        # path into a single file-name stem.
        base_name = f[5:].replace('/', '_')

        for t_add in range(4):
            df_gdd = calculate_gdd_add(df, t_add)
            out_path = 'results/%s_%d.txt' % (base_name, t_add)
            df_gdd.to_csv(out_path, index=False, sep='\t')





In [246]:
# Bundle everything under results/ into results.zip (shell escape).
!zip -r results.zip results/*

  adding: results/SCH1A1_28823_Тукан_0.txt (deflated 67%)
  adding: results/SCH1A1_28823_Тукан_1.txt (deflated 68%)
  adding: results/SCH1A1_28823_Тукан_2.txt (deflated 68%)
  adding: results/SCH1A1_28823_Тукан_3.txt (deflated 68%)
  adding: results/SCH1A1_28825__Стерлитамак_0.txt (deflated 67%)
  adding: results/SCH1A1_28825__Стерлитамак_1.txt (deflated 68%)
  adding: results/SCH1A1_28825__Стерлитамак_2.txt (deflated 69%)
  adding: results/SCH1A1_28825__Стерлитамак_3.txt (deflated 68%)
  adding: results/SCH1A1_28833_Верхнеуральск_0.txt (deflated 67%)
  adding: results/SCH1A1_28833_Верхнеуральск_1.txt (deflated 68%)
  adding: results/SCH1A1_28833_Верхнеуральск_2.txt (deflated 67%)
  adding: results/SCH1A1_28833_Верхнеуральск_3.txt (deflated 69%)
  adding: results/SCH1A1_28900_Самара_0.txt (deflated 68%)
  adding: results/SCH1A1_28900_Самара_1.txt (deflated 68%)
  adding: results/SCH1A1_28900_Самара_2.txt (deflated 68%)
  adding: results/SCH1A1_28900_Самара_3.txt (deflated 68%)
  adding