In [2]:
import vis
import utils
import train
import joblib
import pandas as pd
import geopandas as gpd
from train import get_processed_data_for_country
from sklearn.metrics import classification_report
from vis  import main_gdf_with_target_over_images
import matplotlib.pyplot as plt

def ids_country_analysis(ids):
    """Print how many of the given test IDs fall in each country.

    Reads the test data through ``utils.read_test_data()``, keeps only rows whose
    ``ID`` is in ``ids``, prints the ``Country`` value counts and returns the
    filtered frame with a fresh 0..n-1 index.
    """
    all_test_rows = utils.read_test_data()
    id_mask = all_test_rows['ID'].isin(ids)
    selected = all_test_rows[id_mask].reset_index(drop=True)
    country_counts = selected['Country'].value_counts()
    print(country_counts)
    return selected

def compare_with_latest_best_targets(df, latest_best_file_path='./latest_best.csv', country=None):
    """Report how the ``Target`` column of ``df`` differs from a reference submission.

    Args:
        df: Frame with ``ID`` and ``Target`` columns. It is not modified; a sorted
            copy is used internally.
        latest_best_file_path: CSV holding the reference ``ID``/``Target`` columns.
        country: When given, the reference is restricted to rows of this country
            (the ``Country`` column is attached by ``add_country_to_file``).

    Returns:
        The list of IDs whose target differs from the reference, or ``None``
        (after printing ``'error'``) when the two frames do not contain exactly
        the same IDs, including when they differ in length.
    """
    latest_best_df = pd.read_csv(latest_best_file_path)
    if country is not None:
        latest_best_df = add_country_to_file(latest_best_df)
        latest_best_df = latest_best_df[latest_best_df['Country'] == country]
    print(f'Shape of latest best: {latest_best_df.shape}')
    print(f'Shape of df: {df.shape}')

    # Sort both frames by ID so that rows can be compared position by position.
    latest_best_df = latest_best_df.sort_values(by='ID').reset_index(drop=True)
    df = df.sort_values(by='ID').reset_index(drop=True)
    df_ids = df['ID'].values
    latest_best_df_ids = latest_best_df['ID'].values

    # Comparing the full lists also catches a length mismatch, which the former
    # index-based loop either crashed on or silently ignored.
    if list(df_ids) != list(latest_best_df_ids):
        print('error')
        return
    print('Same IDs')

    df_targets = df['Target'].values
    latest_best_targets = latest_best_df['Target'].values
    diff_ids = [
        row_id
        for row_id, current, best in zip(df_ids, df_targets, latest_best_targets)
        if current != best
    ]
    target_diff_count = len(diff_ids)
    total = len(df_ids)
    # Guard the ratio so an empty comparison reports 0 instead of dividing by zero.
    diff_fraction = target_diff_count / total if total else 0.0

    print('IDs with different targets Country analysis:')
    ids_country_analysis(diff_ids)
    print()

    print('Number of changed targets:')
    print(target_diff_count)
    print()

    print('Expected Change in accuracy:')
    print(f'+/- {diff_fraction * 100}%')
    print()

    print('Current targets value counts:')
    print(df['Target'].value_counts())
    print()

    print('Latest best targets value counts:')
    print(latest_best_df['Target'].value_counts())
    print()
    print(f'Target diff count: {target_diff_count}')
    print(f'Target diff %: {diff_fraction}')
    return diff_ids

def copy_targets_from_latest_best(df, latest_best_file_path='./latest_best.csv', country=None):
    """Return a copy of ``df`` (sorted by ID) whose ``Target`` comes from the reference file.

    Args:
        df: Frame with an ``ID`` column. The caller's frame is left untouched.
        latest_best_file_path: CSV holding the reference ``ID``/``Target`` columns.
        country: When given, the reference is restricted to rows of this country.

    Returns:
        The sorted, re-indexed copy with ``Target`` overwritten, or ``None``
        (after printing ``'error'``) when the two frames do not contain exactly
        the same IDs, including when they differ in length.
    """
    latest_best_df = pd.read_csv(latest_best_file_path)
    if country is not None:
        latest_best_df = add_country_to_file(latest_best_df)
        latest_best_df = latest_best_df[latest_best_df['Country'] == country]

    latest_best_df = latest_best_df.sort_values(by='ID').reset_index(drop=True)
    df = df.sort_values(by='ID').reset_index(drop=True)

    # A full-list comparison rejects differing lengths as well as differing IDs;
    # without it a shorter ``df`` would silently receive a prefix of the targets.
    if list(df['ID'].values) != list(latest_best_df['ID'].values):
        print('error')
        return
    print('Same IDs')

    # Both frames share the 0..n-1 index here, so index alignment is positional.
    df['Target'] = latest_best_df['Target']
    return df

def add_country_to_file(df):
    """Return ``df`` sorted by ID with a ``Country`` column taken from the test data.

    The country is looked up by ``ID`` rather than by row position, so a frame
    that holds only a subset of the test IDs (or a different number of rows)
    still gets the right country per row. IDs absent from the test data get NaN.
    """
    test_gdf = utils.read_test_data()
    # One country per ID; drop_duplicates keeps the lookup index unique.
    country_by_id = test_gdf.drop_duplicates(subset='ID').set_index('ID')['Country']
    df = df.sort_values(by='ID').reset_index(drop=True)
    df['Country'] = df['ID'].map(country_by_id)
    return df

def _predict_split(split, country, classifer_key, train_gdf, test_gdf):
    """Copy the requested split, load the country's classifier and predict on it.

    Prints the classification report of the predictions against the split's
    existing ``Target`` column and returns ``(gdf_copy, predictions)``.
    """
    gdf = train_gdf.copy() if split == 'train' else test_gdf.copy()
    clf = joblib.load(f'{country}_{classifer_key}_classifier.joblib')
    X = gdf[clf.feature_names_in_]
    preds = clf.predict(X)
    print(classification_report(gdf['Target'], preds))
    return gdf, preds


def main(direction, dates, country, train_gdf, test_gdf):
    """Render three target-over-image maps for one country and direction.

    1. The train split with its existing labels.
    2. The test split with the RF classifier's predictions.
    3. The test split with the targets copied from the latest best submission
       (a comparison report is printed before and after the copy).

    Args:
        direction: Direction part of the location name (e.g. ``'North_West'``).
        dates: List of date strings; ``dates[0]`` is used in the PDF file names.
        country: Country name, used for the classifier file and location names.
        train_gdf: Processed train frame containing the classifier's features and ``Target``.
        test_gdf: Processed test frame containing the classifier's features and ``Target``.

    Returns:
        Tuple ``(d_train, d_test, d_best)`` of the values returned by
        ``main_gdf_with_target_over_images`` for the three maps.
    """
    classifer_key = 'RF'

    # --- Train split, existing labels -------------------------------------------------
    split = 'train'
    true_label = True
    locations = [f'{country}_{direction}_{split}']
    gdf, _ = _predict_split(split, country, classifer_key, train_gdf, test_gdf)
    fig_name = f'./map_pdfs/{country}_{locations[0]}_{true_label}_{split}_{dates[0]}.pdf'
    d_train = main_gdf_with_target_over_images(gdf, dates, locations, fig_name=fig_name)

    # --- Test split, predicted labels -------------------------------------------------
    split = 'test'
    true_label = False
    locations = [f'{country}_{direction}_{split}']
    gdf, preds = _predict_split(split, country, classifer_key, train_gdf, test_gdf)
    gdf['Target'] = preds
    fig_name = f'./map_pdfs/{country}_{locations[0]}_{true_label}_{split}_{dates[0]}.pdf'
    d_test = main_gdf_with_target_over_images(gdf, dates, locations, fig_name=fig_name)

    # --- Test split, targets copied from the latest best submission -------------------
    gdf, _ = _predict_split(split, country, classifer_key, train_gdf, test_gdf)
    compare_with_latest_best_targets(gdf, country=country)
    gdf = copy_targets_from_latest_best(gdf, country=country)
    compare_with_latest_best_targets(gdf, country=country)
    fig_name = f'./map_pdfs/{country}_{locations[0]}_{true_label}_{split}_{dates[0]}_latest_best.pdf'
    d_best = main_gdf_with_target_over_images(gdf, dates, locations, fig_name=fig_name)
    return d_train, d_test, d_best

def _country_scatter_maps(country, dates, directions):
    """Load the processed data for ``country`` once and run ``main`` for each direction."""
    train_gdf, test_gdf, _train_original, _test_original = get_processed_data_for_country(country)
    for direction in directions:
        main(direction, dates, country, train_gdf, test_gdf)


def iran_scatter_map():
    """Generate the Iran maps for 2019-11-22 (North_West, North_East, South_West)."""
    _country_scatter_maps('Iran', ['2019-11-22'], ['North_West', 'North_East', 'South_West'])


def sudan_scatter_map():
    """Generate the Sudan maps for 2019-10-30 (North_West, North_East, South_West)."""
    _country_scatter_maps('Sudan', ['2019-10-30'], ['North_West', 'North_East', 'South_West'])


def afghanistan_scatter_map():
    """Generate the Afghanistan maps for 2022-04-03 (North_West, South_East)."""
    _country_scatter_maps('Afghanistan', ['2022-04-03'], ['North_West', 'South_East'])

def get_processed_data_for_country(country):
    """Fetch the processed train/test frames for ``country`` and match them to the originals.

    NOTE: this definition shadows the ``get_processed_data_for_country`` imported
    from ``train`` at the top of the cell.

    Columns containing any NaN are dropped from the train and test frames
    independently, and both frames are re-indexed from 0.

    Returns:
        ``(train_gdf, test_gdf, train_original, test_original)``.
    """
    country_data = train.get_porcessed_data_dict()[country]
    train_gdf, test_gdf = country_data['train'], country_data['test']
    train_original, test_original = train.get_orginal_train_test(country)
    print(f'train_gdf: {train_gdf.shape}, test_gdf: {test_gdf.shape}')
    print(f'train_original: {train_original.shape}, test_original: {test_original.shape}')

    train_gdf, test_gdf = train.train_and_test_gdfs_matched_to_original_train_test(
        train_gdf, test_gdf, train_original, test_original, country
    )
    print(f'train_gdf: {train_gdf.shape}, test_gdf: {test_gdf.shape}')

    # Drop columns with NaNs, then renumber the rows.
    train_gdf = train_gdf.dropna(axis=1).reset_index(drop=True)
    test_gdf = test_gdf.dropna(axis=1).reset_index(drop=True)
    return train_gdf, test_gdf, train_original, test_original


def scatter_gdf(gdf, tiffs, key, fig_name='scatter_gdf.png', write_ids=False):
    """Scatter the positive/negative targets of ``gdf`` over one true-color image.

    Every entry of ``tiffs`` is first enriched in place with the matched scatter
    positions and their positive/negative split; the entry named by ``key`` is
    then plotted, saved to ``fig_name`` and shown.

    Args:
        gdf: Frame with the points to plot (passed to ``vis.targets_scatter_positions``).
        tiffs: Dict of image dicts, each holding ``'x_axis'``, ``'y_axis'`` and
            ``'true_color_arr'``.
        key: Entry of ``tiffs`` to plot. If it is not present, the last entry is
            plotted instead (the previous behaviour, where ``key`` was overwritten
            by the loop variable).
        fig_name: Output path of the saved figure.
        write_ids: Label each point with its matched ID instead of its running index.

    Returns:
        The enriched ``tiffs[key]`` dict, or ``None`` as soon as an image has no
        matched targets.
    """
    print('Generating scatter positions...')
    # A distinct loop name keeps the ``key`` argument intact.
    for tiff_key, tiff in tiffs.items():
        x_inds, y_inds, targets, ids_matched = vis.targets_scatter_positions(gdf, tiff['x_axis'], tiff['y_axis'])
        print(f'Image {tiff_key} has {len(targets)} matched targets out of {len(gdf)}')
        if len(targets) == 0:
            return None
        tiff['x_inds'] = x_inds
        tiff['y_inds'] = y_inds
        tiff['targets'] = targets
        tiff['ids_matched'] = ids_matched
    for tiff_key in tiffs:
        tiffs[tiff_key] = vis.split_target_scatter_dict_to_pos_neg(tiffs[tiff_key], positive_target=1, negative_target=0)

    if key not in tiffs and tiffs:
        # Backward compatibility: fall back to the last image, as before.
        key = list(tiffs)[-1]
    selected = tiffs[key]

    def _scatter_group(ax, xs, ys, matched_ids, label, color):
        """Draw one class of points and annotate each with its ID or index."""
        ax.scatter(xs, ys, s=100, label=label, color=color)
        for i in range(len(xs)):
            text = matched_ids[i] if write_ids else i
            ax.text(xs[i], ys[i], text, fontsize=30, color=color)

    fig, ax = plt.subplots(figsize=(80, 80))
    ax.imshow(vis.fix_image(selected['true_color_arr']))
    _scatter_group(ax, selected['x_inds_positive'], selected['y_inds_positive'],
                   selected['id_matched_positive'], 'Positive', 'red')
    _scatter_group(ax, selected['x_inds_negative'], selected['y_inds_negative'],
                   selected['id_matched_negative'], 'Negative', 'blue')
    ax.legend()
    ax.set_title(f'{key} Scatter')
    ax.grid(False)
    fig.savefig(fig_name)
    plt.show()
    # Release the very large figure; this function is called in a loop.
    plt.close(fig)
    return selected




In [4]:
# Compare an Afghanistan-only submission with the Afghanistan rows of the latest best file.
# `ids` receives the IDs whose targets differ (None if the ID sets do not match).
x = pd.read_csv('./Afghanistan_low_cloud_tall_submession.csv')
ids = compare_with_latest_best_targets(x, country='Afghanistan')
# Flat ID/target views of both files, for merging the two submissions.
best_sub = pd.read_csv('./latest_best.csv')
best_sub_ids = best_sub['ID'].values
best_sub_targets = best_sub['Target'].values
x_ids = x['ID'].values.tolist()
x_targets = x['Target'].values.tolist()

Shape of latest best: (500, 3)
Shape of df: (500, 5)
Same IDs
IDs with different targets Country analysis:
Afghanistan    53
Name: Country, dtype: int64

Number of changed targets:
53

Expected Change in accuracy:
+/- 10.6%

Current targets value counts:
0    262
1    238
Name: Target, dtype: int64

Latest best targets value counts:
1    259
0    241
Name: Target, dtype: int64

Target diff count: 53
Target diff %: 0.106


  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [23]:
# Build the merged target list in latest-best row order: rows whose ID is in
# `ids` take the target from `x`, every other row keeps the latest-best target.
x_target_by_id = {}
for x_id, x_target in zip(x_ids, x_targets):
    # setdefault keeps the first occurrence, matching list.index() semantics.
    x_target_by_id.setdefault(x_id, x_target)

# Set membership replaces the repeated linear scans of `ids` and `x_ids`.
changed_ids = set(ids)
new_targets = [
    x_target_by_id[best_id] if best_id in changed_ids else best_target
    for best_id, best_target in zip(best_sub_ids, best_sub_targets)
]

In [25]:
# Overwrite the latest-best targets in place with the merged list, report what changed
# against the file on disk, and save under a name that records the number of changed rows.
# NOTE(review): `len(ids_changed)` fails if the comparison returns None (ID mismatch).
best_sub['Target'] = new_targets
ids_changed = compare_with_latest_best_targets(best_sub)
best_sub.to_csv(f'./latest_best_{len(ids_changed)}_changed.csv', index=False)

Shape of latest best: (1500, 2)
Shape of df: (1500, 2)
Same IDs
IDs with different targets Country analysis:
Afghanistan    52
Name: Country, dtype: int64

Number of changed targets:
52

Expected Change in accuracy:
+/- 3.4666666666666663%

Current targets value counts:
1    764
0    736
Name: Target, dtype: int64

Latest best targets value counts:
1    792
0    708
Name: Target, dtype: int64

Target diff count: 52
Target diff %: 0.034666666666666665


  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [None]:
# Country toggle: only the last assignment takes effect (currently 'Iran');
# the two assignments above it are overwritten immediately.
country = 'Sudan'
country = 'Afghanistan'
country = 'Iran'
train_gdf, test_gdf, train_original, test_original = get_processed_data_for_country(country)

In [None]:
# Predict the test targets with the VOTE classifier and collect the rows whose
# prediction differs from the latest best submission into `diffs_df_vote`.
clf = joblib.load(f'{country}_VOTE_classifier.joblib')
drop_cols = ['Location', 'geometry', 'Lat', 'Lon', 'ID', 'Target']
X = test_gdf.drop(columns=drop_cols)
preds = clf.predict(X)
test_gdf['Target'] = preds  # overwrites the Target column of the shared test frame
submession_df = test_gdf[['ID', 'Target']]
diff_ids = compare_with_latest_best_targets(submession_df, country=country)

submession_df_ids = submession_df['ID'].values
submession_df_targets = submession_df['Target'].values
dict_diffs = {'ID': [], 'Target': []}
for current_id, current_target in zip(submession_df_ids, submession_df_targets):
    if current_id not in diff_ids:
        continue
    print(f'ID: {current_id}, Target: {current_target}')
    dict_diffs['ID'].append(current_id)
    dict_diffs['Target'].append(current_target)

diffs_df = pd.DataFrame(dict_diffs)
diffs_df_vote = diffs_df.copy()

In [None]:
# Predict the test targets with the STACK classifier and collect the rows whose
# prediction differs from the latest best submission, together with the class
# probabilities and their absolute gap ("confidance"), into `diffs_df_stack`.
clf = joblib.load(f'{country}_STACK_classifier.joblib')
drop_cols = ['Location', 'geometry', 'Lat', 'Lon', 'ID', 'Target']
X = test_gdf.drop(columns=drop_cols)
preds = clf.predict(X)
test_gdf['Target'] = preds  # overwrites the Target column of the shared test frame
submession_df = test_gdf[['ID', 'Target']]
diff_ids = compare_with_latest_best_targets(submession_df, country=country)

submession_df_ids = submession_df['ID'].values
submession_df_targets = submession_df['Target'].values
submession_df_preds_prob = clf.predict_proba(X)
dict_diffs = {'ID': [], 'Target': [], 'Preds': [], 'confidance': []}
rows = zip(submession_df_ids, submession_df_targets, submession_df_preds_prob)
for current_id, current_target, current_proba in rows:
    if current_id not in diff_ids:
        continue
    print(f'ID: {current_id}, Target: {current_target}, Preds: {current_proba}')
    conf = abs(current_proba[0] - current_proba[1])
    dict_diffs['ID'].append(current_id)
    dict_diffs['Target'].append(current_target)
    dict_diffs['Preds'].append(current_proba)
    dict_diffs['confidance'].append(conf)

# Most confident disagreements first.
diffs_df = pd.DataFrame(dict_diffs).sort_values(by='confidance', ascending=False)
# diffs_df = diffs_df.reset_index(drop=True)
# diffs_df = diffs_df[diffs_df['confidance'] > 0.1]
# diffs_df = diffs_df.reset_index(drop=True)
# diff_ids = diffs_df['ID'].values
diffs_df_stack = diffs_df.copy()

In [None]:
import numpy as np

# Attach the VOTE prediction to each STACK disagreement (-1 when VOTE did not
# flag that ID), then keep at most three rows where both models agree and the
# STACK probability gap exceeds 0.4.
diffs_df_stack.shape, diffs_df_vote.shape
stack_ids = diffs_df_stack['ID'].values
vote_ids = diffs_df_vote['ID'].values
votes_targets = diffs_df_vote['Target'].values
votes_to_stack_targets = [
    votes_targets[np.where(vote_ids == stack_id)[0][0]] if stack_id in vote_ids else -1
    for stack_id in stack_ids
]
diffs_df_stack['vote_target'] = votes_to_stack_targets
diffs_df_stack['Same_Target'] = diffs_df_stack['Target'] == diffs_df_stack['vote_target']

confident_agreement = diffs_df_stack['Same_Target'] & (diffs_df_stack['confidance'] > 0.4)
diffs_df_stack_same = diffs_df_stack[confident_agreement].reset_index(drop=True).iloc[:3]

In [None]:
# Display the confident STACK/VOTE agreements selected above.
diffs_df_stack_same

In [None]:
# For every direction of the chosen country, load the test-split image of the
# chosen date and scatter the rows of `test_gdf` whose ID is in `diff_ids` on it.
# NOTE(review): `test_gdf` and `diff_ids` come from earlier cells; confirm they
# belong to the same country as `country` below before running.
directions = ['North_West', 'North_East', 'South_East', 'South_West']
# dates = ['2022-04-03']
# country = 'Afghanistan'
# dates = ['2019-11-22']
# country = 'Iran'
dates = ['2019-10-30']
country = 'Sudan'
splits = ['train', 'test']
for direction in directions:
    tiffs = {}
    dates = list(set(dates))
    locations = list(set([f'{country}_{direction}_{splits[1]}']))
    print(f'Number of dates: {len(dates)}')
    print(f'Number of locations: {len(locations)}')
    print(f'Total number of images: {len(dates) * len(locations)}')
    print()

    print('loading tiffs...')
    for date in dates:
        for location in locations:
            key = f'{date}_{location}_ALL_tiff'
            tiffs[key] = {'gdf_all': vis.get_tiff(date, location)}

    print('Generating image arrays...')
    for key, tiff in tiffs.items():
        true_color_arr, x_axis, y_axis = vis.true_color_df_to_image_array(tiff['gdf_all'])
        tiff.update(true_color_arr=true_color_arr, x_axis=x_axis, y_axis=y_axis)

    gdf = test_gdf[['Location', 'geometry', 'Lat', 'Lon', 'ID', 'Target']]
    gdf = gdf[gdf['ID'].isin(diff_ids)].reset_index(drop=True)
    xx = scatter_gdf(gdf, tiffs, key, fig_name=f'ids_diff_scatter_gdf_{locations[0]}_{country}.pdf', write_ids=True)

In [None]:
# Keep only the hand-picked rows of `diffs_df` (selected by index label) and display them.
# NOTE(review): the first bare `diffs_df` is not the cell's last expression, so it shows nothing.
diffs_df
# Index labels chosen manually — TODO record how they were chosen.
no_skip_indices = [3,4,5,9,11,14,15,16]
diffs_df_no_skip = diffs_df[diffs_df.index.isin(no_skip_indices)]
diffs_df_no_skip = diffs_df_no_skip.reset_index(drop=True)
diffs_df_no_skip

In [None]:
# Afghanistan: predict the test targets with the VOTE classifier and keep ID/Target only.
country = 'Afghanistan'
train_gdf, test_gdf, train_original, test_original = get_processed_data_for_country(country)
drop_cols = ['Location', 'geometry', 'Lat', 'Lon', 'ID', 'Target']
clf = joblib.load(f'{country}_VOTE_classifier.joblib')
X = test_gdf.drop(columns=drop_cols)
preds = clf.predict(X)
test_gdf['Target'] = preds
afghanistan_submession_df = test_gdf[['ID', 'Target']]

In [None]:
# Sudan: predict the test targets with the RF classifier and keep ID/Target only.
country = 'Sudan'
train_gdf, test_gdf, train_original, test_original = get_processed_data_for_country(country)
drop_cols = ['Location', 'geometry', 'Lat', 'Lon', 'ID', 'Target']
clf = joblib.load(f'{country}_RF_classifier.joblib')
X = test_gdf.drop(columns=drop_cols)
preds = clf.predict(X)
test_gdf['Target'] = preds
sudan_submession_df = test_gdf[['ID', 'Target']]

In [None]:
# Iran: predict the test targets with the VOTE classifier and keep ID/Target only.
country = 'Iran'
train_gdf, test_gdf, train_original, test_original = get_processed_data_for_country(country)
drop_cols = ['Location', 'geometry', 'Lat', 'Lon', 'ID', 'Target']
clf = joblib.load(f'{country}_VOTE_classifier.joblib')
X = test_gdf.drop(columns=drop_cols)
preds = clf.predict(X)
test_gdf['Target'] = preds
iran_submession_df = test_gdf[['ID', 'Target']]

In [None]:
# Stack the three per-country submissions into one frame ordered by ID.
country_submissions = [afghanistan_submession_df, sudan_submession_df, iran_submession_df]
all_submession_df = (
    pd.concat(country_submissions)
    .sort_values(by='ID')
    .reset_index(drop=True)
)

In [None]:
# Report the differences to the latest best file, replace every target with the
# latest-best one, then report again.
diff_ids = compare_with_latest_best_targets(all_submession_df)
all_submession_df = copy_targets_from_latest_best(all_submession_df)
diff_ids = compare_with_latest_best_targets(all_submession_df)

In [None]:
# Re-apply one country's own predictions on top of the combined submission and
# report how the result differs from the latest best file.
country_to_keep = 'Afghanistan'
country = country_to_keep
if country_to_keep == 'Afghanistan':
    all_submession_df_to_keep = afghanistan_submession_df.copy()
elif country_to_keep == 'Sudan':
    all_submession_df_to_keep = sudan_submession_df.copy()
elif country_to_keep == 'Iran':
    all_submession_df_to_keep = iran_submession_df.copy()
else:
    # Fail loudly instead of silently reusing a stale frame from an earlier run.
    raise ValueError(f'Unknown country_to_keep: {country_to_keep!r}')

ids_to_keep = all_submession_df_to_keep['ID'].values
targets_to_keep = all_submession_df_to_keep['Target'].values
# Built in reverse so that, for a duplicated ID, the first occurrence wins.
target_by_id = dict(zip(ids_to_keep[::-1], targets_to_keep[::-1]))

# Assign through .loc: writing into `Series.values` is not guaranteed to reach
# the frame and raises under pandas Copy-on-Write.
keep_mask = all_submession_df['ID'].isin(ids_to_keep)
if keep_mask.any():
    all_submession_df.loc[keep_mask, 'Target'] = (
        all_submession_df.loc[keep_mask, 'ID'].map(target_by_id).values
    )
all_submession_df = all_submession_df.sort_values(by='ID').reset_index(drop=True)
_ = compare_with_latest_best_targets(all_submession_df)

In [None]:
# Disagreeing rows whose predicted target is 0, renumbered from 0.
diffs_df_zero = diffs_df.loc[diffs_df['Target'] == 0].reset_index(drop=True)
diffs_df_zero

In [None]:
# ID: ID_4SZZXFSYB4ME, Target: 0, Preds: [0.61176471 0.38823529]
# ID: ID_22EGMNC65QJL, Target: 1, Preds: [0.41764706 0.58235294]
# ID: ID_QGXR12BWLQ5G, Target: 1, Preds: [0.42941176 0.57058824]
# ID: ID_6YVPRDP1DI46, Target: 1, Preds: [0.45882353 0.54117647]
# ID: ID_GTXHJEGD2Y13, Target: 0, Preds: [0.50588235 0.49411765]
# ID: ID_BV2M6HQWL7FJ, Target: 0, Preds: [0.51176471 0.48823529]
# ID: ID_HWT4U5THIJYN, Target: 0, Preds: [0.51176471 0.48823529]

In [None]:
# ID: ID_XGIY5UACMRGZ, Target: 1, Preds: [0.48666667 0.51333333]
# ID: ID_9RMSW504CUDX, Target: 1, Preds: [0.48666667 0.51333333]
# ID: ID_5DJLPY4K50GK, Target: 1, Preds: [0.42666667 0.57333333]
# ID: ID_QGXR12BWLQ5G, Target: 1, Preds: [0.47 0.53]
# ID: ID_4SZZXFSYB4ME, Target: 0, Preds: [0.51333333 0.48666667]
# ID: ID_22EGMNC65QJL, Target: 1, Preds: [0.49333333 0.50666667]
# ID: ID_HR9VZG6INART, Target: 1, Preds: [0.49333333 0.50666667]
# ID: ID_PO650RIMWGD2, Target: 1, Preds: [0.48666667 0.51333333]
# ID: ID_94KD2KB6O746, Target: 1, Preds: [0.49333333 0.50666667]
# ids_changing =[
#     # ('ID_5DJLPY4K50GK', 1),
#     # ('ID_QGXR12BWLQ5G', 1),
#     ('ID_59HVM5DLXFZU', 1),
#     ('ID_0MWHKSKMJB72', 1),
#     ('ID_1PP3PY0K6FTE', 0),
#     # ('ID_0TI5KKL5OJ19', 1),
# ]
# Apply the confident STACK/VOTE agreements to the combined submission.
ids_changing = [(row.ID, row.Target) for row in diffs_df_stack_same.itertuples()]
for id_to_change, target in ids_changing:
    # Assign through .loc: writing into `Series.values` is not guaranteed to
    # reach the frame and raises under pandas Copy-on-Write.
    all_submession_df.loc[all_submession_df['ID'] == id_to_change, 'Target'] = target

In [None]:
# Display the rows that were just applied to the combined submission.
diffs_df_stack_same

In [None]:
# Print the IDs that now differ from the latest best file.
# NOTE(review): this rebinds `x`, which an earlier cell uses for a DataFrame.
x=compare_with_latest_best_targets(all_submession_df)
print(x)

In [None]:
# Save the combined submission; `country` only affects the output file name here.
country = 'Iran'
fp = f'./{country}_from_file_rest_from_latest_best_with_stack_and_vote_best_combined.csv'
all_submession_df.to_csv(fp, index=False)
print('Done')

In [None]:
# Round-trip check: reload the saved file and compare it with the latest best file.
d = pd.read_csv(fp)
x= compare_with_latest_best_targets(d)
print(x)

In [None]:
# Ad-hoc ratio; the notebook does not record what 722 and 778 stand for — TODO document.
722/778

In [None]:
# Recorded scores keyed by score value, one all-zeros and one all-ones entry per country
# (per the value names). NOTE(review): float keys collide if two scores are equal.
score_submession_dict = {
    0.157777777: 'submession_afghanistan_df_zeros',
    0.164444444: 'submession_afghanistan_df_ones',
    0.126666666: 'submession_iran_df_zeros',
    0.162222222: 'submession_iran_df_ones',
    0.195555555: 'submession_sudan_df_zeros',
    0.193333333: 'submession_sudan_df_ones',
}
# Same scores, grouped by kind in Afghanistan, Iran, Sudan order.
ones_scores = [0.164444444, 0.162222222, 0.193333333]
zeros_scores = [0.157777777, 0.126666666, 0.195555555]
# Ratio of the summed zeros scores to the summed ones scores.
print(sum(zeros_scores)/sum(ones_scores))

In [None]:
# Load three submissions, order each by ID, check that they list the same IDs
# row by row, then print every ID where s1 and s3 disagree on the target.
s1, s2, s3 = (
    pd.read_csv(path).sort_values(by='ID').reset_index(drop=True)
    for path in ('s1.csv', 's2.csv', 's3.csv')
)
s1_ids, s2_ids, s3_ids = s1['ID'].values, s2['ID'].values, s3['ID'].values

for i, s1_id in enumerate(s1_ids):
    if s1_id != s2_ids[i]:
        print('error')
    if s1_id != s3_ids[i]:
        print('error')

s1_targets = s1['Target'].values
s3_targets = s3['Target'].values
for i, s1_id in enumerate(s1_ids):
    # if s1['Target'].values[i] != s2['Target'].values[i]:
    #     print(s1_ids[i])
    if s1_targets[i] != s3_targets[i]:
        print(s1_id)

In [None]:
# Build a target list in which every row that is listed in `id_diff` and is
# currently 0 becomes 1; all other rows keep their target. Flipped rows are printed.
# NOTE(review): `id_diff` is not defined in any visible cell (other cells use
# `diff_ids`) — confirm which collection is intended.
all_submession_df_ids = all_submession_df['ID'].values
all_submession_df_targets = all_submession_df['Target'].values
id_t_zero = []
# zip replaces the hand-maintained counter; `row_id` avoids shadowing builtin `id`.
for row_id, t in zip(all_submession_df_ids, all_submession_df_targets):
    if row_id in id_diff and t == 0:
        print(f'{row_id},{t}')
        id_t_zero.append(1)
    else:
        id_t_zero.append(t)

In [None]:
# Write the flipped targets back and report the differences to the latest best file.
all_submession_df['Target'] = id_t_zero
compare_with_latest_best_targets(all_submession_df)

In [None]:
# Save the current combined submission; the file name depends on the current `country`.
all_submession_df.to_csv(f'./{country}_from_file_rest_from_latest_best_small_rf_1_changes_only.csv', index=False)


In [None]:
# Re-run the comparison of the combined submission with the latest best file.
compare_with_latest_best_targets(all_submession_df)

In [None]:
# Hand-recorded list of test IDs; the notebook does not say how it was selected — TODO document.
tiff_ids = ['ID_0MWHKSKMJB72',
 'ID_0TI5KKL5OJ19',
 'ID_1PP3PY0K6FTE',
 'ID_59HVM5DLXFZU',
 'ID_5P8D8KE1YCLH',
 'ID_6FUVGW0COA03',
 'ID_926RG17914DH',
 'ID_9CL5EE0GEOKN',
 'ID_9IDPXWOQC9DX',
 'ID_A8SSS8PDJZQO',
 'ID_AG5A6YN9F6SF',
 'ID_CBUDQFSHMI77',
 'ID_EVJ7M04NWFUU',
 'ID_GWVGO1IHSXX3',
 'ID_IW5CYZC97K0U',
 'ID_JFY4EH7GCEDQ',
 'ID_NGRNX0RH7BC7',
 'ID_OIYR2OMLBJXZ',
 'ID_POTFEDHK62PL',
 'ID_QRO4BP4LU2HG',
 'ID_T4XQQ1S50ZB8',
 'ID_VSOJWXF30ZRW']

In [None]:
# Load the "1 changes only" and "0 changes only" variants and the latest best
# file, reading each from the path variable defined for it.
path1 = f'./{country}_from_file_rest_from_latest_best_small_rf_1_changes_only.csv'
path2 = f'./{country}_from_file_rest_from_latest_best_small_rf_0_changes_only.csv'
path_best = './latest_best.csv'
df1 = pd.read_csv(path1)
df2 = pd.read_csv(path2)
df_best = pd.read_csv(path_best)

In [None]:
# Compare the "1 changes only" variant with the latest best file.
compare_with_latest_best_targets(df1, path_best)