# Merging all folds test prediction

In [4]:
from collections import Counter
import pandas as pd

In [2]:
df1 = pd.read_csv("submissions/submission_fold_0.csv", dtype=str).rename(columns={"label": "label1"})
df2 = pd.read_csv("submissions/submission_fold_1.csv", dtype=str).rename(columns={"label": "label2"})
df3 = pd.read_csv("submissions/submission_fold_2.csv", dtype=str).rename(columns={"label": "label3"})
df4 = pd.read_csv("submissions/submission_fold_3.csv", dtype=str).rename(columns={"label": "label4"})
df5 = pd.read_csv("submissions/submission_fold_4.csv", dtype=str).rename(columns={"label": "label5"})

In [3]:
df1 = df1.merge(df2)
df1 = df1.merge(df3)
df1 = df1.merge(df4)
df1 = df1.merge(df5)

In [4]:
del df2, df3, df4, df5

In [5]:
df1.shape

(1000, 6)

## Majority Voting
All cases where atleast 3 or more models make same prediction

In [6]:
label_cols = ["label1", "label2", "label3", "label4", "label5"]

In [7]:
df1["num_matches"] = df1[label_cols].apply(lambda x: Counter(x.values).most_common()[0][1], axis=1)
df1["most_frequent"] = df1[label_cols].mode(axis=1)[0]

In [8]:
df1["num_matches"].value_counts()

5    451
3    196
2    162
4    158
1     33
Name: num_matches, dtype: int64

In [9]:
df1.loc[df1["num_matches"]>=3]["most_frequent"].str.len().value_counts()

8    805
Name: most_frequent, dtype: int64

In [10]:
df1.loc[df1["num_matches"]>=3].sample(n=5)

Unnamed: 0,tag,label1,label2,label3,label4,label5,num_matches,most_frequent
544,482,31260381,31260381,31260351,31260381,31266381,3,31260381
941,428,37173066,37173066,37173066,37730686,37173066,4,37173066
682,24,50092871,50092871,50092871,50092871,50092871,5,50092871
766,2,79182781,79182781,79182781,79182781,79182781,5,79182781
761,682,43624735,43629735,43629735,43629731,43629735,3,43629735


In [11]:
df1.loc[df1["num_matches"]==3].sample(n=5)

Unnamed: 0,tag,label1,label2,label3,label4,label5,num_matches,most_frequent
604,880,71563781,71563781,71563481,71563781,71563481,3,71563781
605,362,78482881,78442881,78412881,78442881,78442881,3,78442881
106,437,11582781,11554781,11584771,11584771,11584771,3,11584771
394,624,53820771,53826791,53820771,53820781,53820771,3,53820771
925,859,78378735,78378735,78378735,7837873151,78378731,3,78378735


**These cases look okayish**

In [12]:
df1.loc[df1["num_matches"]<3].sample(n=5)

Unnamed: 0,tag,label1,label2,label3,label4,label5,num_matches,most_frequent
3,779,64988081,64998071,64988004,64988001,64988081,2,64988081
539,638,16786680,16786600,16786660,16786681,16786681,2,16786681
111,993,51586865,51596865,51586865,5159688151,55968659,2,51586865
815,657,9974871,99748311,99748371,99748311,99746311,2,99748311
13,855,44036881,44036861,44036761,44036981,44036981,2,44036981


In [13]:
part1 = df1.loc[df1["num_matches"]>=3]

In [14]:
part2 = df1.loc[df1["num_matches"]<3]

## Analysis

In [15]:
df1["most_frequent"].str.len().value_counts()

8     996
10      3
7       1
Name: most_frequent, dtype: int64

In [16]:
for col in label_cols:
    df1[f"{col}_length"] = df1[col].astype(str).str.len()

In [17]:
for col in label_cols:
    print(col)
    print(df1[f"{col}_length"].value_counts(), end="\n\n")

label1
8     969
10     21
7       8
17      1
11      1
Name: label1_length, dtype: int64

label2
8     994
10      3
7       2
17      1
Name: label2_length, dtype: int64

label3
8     992
10      6
17      2
Name: label3_length, dtype: int64

label4
8     959
10     39
17      1
11      1
Name: label4_length, dtype: int64

label5
8     986
10     10
17      4
Name: label5_length, dtype: int64



## Character wise majority voting

In [18]:
def get_pred(values):
    pred = ""
    # ignore cases where prediction isn't of length 8
    values = [val for val in values if len(val)==8]
    try:
        for idx in range(8):
            pred += Counter([val[idx] for val in values]).most_common(1)[0][0]
    except Exception:
        pass
    return pred 

In [19]:
part2["most_frequent_after"] = part2[label_cols].apply(lambda x: get_pred(x.values.tolist()), axis=1)

In [20]:
part2.loc[part2['most_frequent']==part2['most_frequent_after']].shape

(103, 9)

In [21]:
part2.loc[part2['most_frequent']!=part2['most_frequent_after']].shape

(92, 9)

In [22]:
part2.loc[part2['most_frequent']!=part2['most_frequent_after']].sample(n=10).sort_values(by="tag")

Unnamed: 0,tag,label1,label2,label3,label4,label5,num_matches,most_frequent,most_frequent_after
561,265,62862871,62860871,62862871,62262871,62860871,2,62860871,62862871
615,487,47188774,47188774,47188194,47188194,47188874,2,47188194,47188774
825,610,14281875,1428187515,14281871,14281871,14281875,2,14281871,14281875
719,706,9839371,98393781,98393771,98393711,98893781,1,9839371,98393781
18,774,72288670,72288610,7228861101,72288611,72288671,1,72288610,72288670
668,777,20560771,20560771,20560747,20560781,20560747,2,20560747,20560771
786,871,80769081,8069081,80769031,80069081,88769681,1,80069081,80769081
713,949,72793968,72793861,72793966,72793981,72793981,2,72793981,72793961
782,977,20810741,30810941,20810941,20810981,20810971,1,20810741,20810941
211,990,42520878,42520871,42520818,42520818,42520878,2,42520818,42520878


### Combining both parts

In [23]:
part2["most_frequent"] = part2["most_frequent_after"]

In [24]:
df = part1[["tag", "most_frequent"]].append(part2[["tag", "most_frequent"]])

In [25]:
df = df.rename(columns={"most_frequent": "label"})

In [26]:
df.head()

Unnamed: 0,tag,label
0,667,15217246
1,733,95520653
4,775,16960812
5,304,46164771
7,307,20242781


In [27]:
df["label"].str.len().value_counts()

8    1000
Name: label, dtype: int64

### Submission

In [1]:
import requests
from pprint import pprint

In [2]:
PUBLIC_host = "http://13.234.225.243"
SUBMISSION_URL = PUBLIC_host + ":8080/submit"
LEADERBOARD_URL = PUBLIC_host + ":8080/leaderboard"

In [30]:
sub_json = df.to_json()
data = {
    "username": "aditya",
    "password": "xxxxx",
    "submission": sub_json,
    "tag": "digits_multi_v2",
}

In [31]:
# r = requests.post(SUBMISSION_URL, json=data)
r.text

'{"success":true,"score":{"acc_8":0.698,"acc_7":0.778,"acc_5":0.946}}'

In [5]:
rl = requests.post(LEADERBOARD_URL, json={})
leaderboard = pd.DataFrame(rl.json()["leaderboard"])
leaderboard = leaderboard.sort_values(by=["score_8", "score_7", "score_5"], ascending=False)
leaderboard["submitted_at"] = leaderboard["submitted_at"].apply(lambda x: pd.to_datetime("-".join(x.split("-")[:-2])))

In [6]:
leaderboard.head(10)

Unnamed: 0,score_5,score_7,score_8,submitted_at,tag,username
0,0.918,0.862,0.723,2020-11-07 06:00:00,crnn_v1_orig_test2,raj
1,0.912,0.868,0.72,2020-11-07 08:00:00,crnn_v1_orig_test3,raj
2,0.912,0.868,0.72,2020-11-07 09:00:00,crnn_v1_orig_test4,raj
3,0.946,0.778,0.698,2020-11-07 07:00:00,digits_multi_v2,aditya
4,0.901,0.856,0.696,2020-11-06 14:00:00,crnn_v1_box,raj
5,0.936,0.78,0.692,2020-11-06 21:00:00,digits_multi_v1,aditya
6,0.92,0.791,0.69,2020-11-07 15:00:00,digits_v3,aditya
7,0.918,0.89,0.616,2020-11-06 14:00:00,crnn_v1_orig,raj
8,0.892,0.852,0.595,2020-11-05 09:00:00,raj_test9,raj
9,0.928,0.853,0.576,2020-11-07 09:00:00,crnn_v1_orig_test5,raj
