In [19]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [21]:
# Input CSV locations, relative to the notebook's working directory.
train_csv_path = './open/train.csv'
test_csv_path = './open//test.csv'

# Read the labelled training table and the unlabelled test table.
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

## EDA

In [22]:
# Show the test DataFrame as this cell's output.
test

Unnamed: 0,ID,first_party,second_party,facts
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."
...,...,...,...,...
1235,TEST_1235,"Haitian Centers Council, Inc., et al.","Chris Sale, Acting Commissioner, Immigration A...",According to Executive Order No. 12807 signed ...
1236,TEST_1236,Whitman,"American Trucking Associations, Inc.",Section 109(a) of the Clean Air Act (CAA) requ...
1237,TEST_1237,Linda A. Matteo and John J. Madigan,William G. Barr,Linda Matteo and John Madigan created a plan f...
1238,TEST_1238,Washington State Apple Advertising Commission,Hunt,"In 1972, the North Carolina Board of Agricultu..."


In [23]:
# Show the training DataFrame as this cell's output.
train

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [24]:
# Check whether any party name occurs both as a first party and as a second party.
first_party_labels = train['first_party'].unique()
second_party_labels = train['second_party'].unique()

print("겹치는 라벨이 있는지 확인:")
print("first_party와 second_party의 라벨 개수:", len(first_party_labels), len(second_party_labels))

# Names present in both columns; later cells read `intersection`.
intersection = set(first_party_labels) & set(second_party_labels)
if not intersection:
    print("겹치는 라벨이 없습니다.")
else:
    print("겹치는 라벨이 있습니다:", intersection)
    print("겹치는 라벨의 갯수는:", len(intersection))

겹치는 라벨이 있는지 확인:
first_party와 second_party의 라벨 개수: 2110 1974
겹치는 라벨이 있습니다: {'California', 'Chevron U.S.A. Inc.', 'Gregory', 'Walker', 'Nevada', 'Federal Trade Commission', 'Campbell', 'David J. Kappos, Under Secretary of Commerce for Intellectual Property and Director, Patent and Trademark Office', 'United States, et al.', 'Ashcroft', 'City and County of San Francisco, California, et al.', 'Republic of Argentina', 'Mitchell', 'Iowa', 'Lee', 'Federal Communications Commission', 'Martin', 'Rhode Island', 'Doug Dretke, Director, Texas Department of Criminal Justice, Correctional Institutions Division', 'Wright', 'United States', 'Maryland', 'Massachusetts', 'Ricky Bell, Warden', 'Idaho', 'Ken L. Salazar, Secretary of the Interior, et al.', 'Jenkins', 'Arkansas', 'Texas', 'Florida', 'New York', 'Lewis', 'Montana', 'INS', 'Garcia', 'Garner', 'Franchise Tax Board of California', 'Tennessee Valley Authority', 'Commonwealth of Virginia', 'Boggs', 'David Patchak', 'Environmental Protection Agenc

In [25]:
# Select every training row in which at least one of the two party names
# belongs to the set of overlapping names.
first_party_overlaps = train['first_party'].isin(intersection)
second_party_overlaps = train['second_party'].isin(intersection)
intersection_rows = train.loc[first_party_overlaps | second_party_overlaps]

print("겹치는 이름이 포함된 행:")
intersection_rows

겹치는 이름이 포함된 행:


Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
17,TRAIN_0017,Steven Spears,United States,"In 2004, Steven Spears was charged and convict...",1
18,TRAIN_0018,United States,Martinez-Salazar,Abel Martinez-Salazar was charged with a varie...,1
20,TRAIN_0020,Clay,United States,"Board No. 47, Louisville, Kentucky, denied the...",1
...,...,...,...,...,...
2466,TRAIN_2466,Monroe Ace Setser,United States,"On October 1, 2007, Lubbock police officers ar...",0
2471,TRAIN_2471,United States,James X. Bormes,"In October 2000, the United States Treasury De...",1
2472,TRAIN_2472,United States,Cuauhtemoc Gonzalez-Lopez,"Cuauhtemoc Gonzalez-Lopez hired Joseph Low, an...",0
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0


In [26]:
# Write the rows with overlapping party names to disk for inspection.
overlap_csv_path = Path('./submision/submission2.csv')
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists; otherwise a run from a fresh checkout fails here.
overlap_csv_path.parent.mkdir(parents=True, exist_ok=True)

df = intersection_rows
df.to_csv(overlap_csv_path, index=False)
print('Done')

Done


## Data Preprocessing

In [27]:
################################
# Label-encode the party names #
################################
from sklearn.preprocessing import LabelEncoder

# Source column -> name of the integer-coded column added to each frame.
encoded_column_for = {
    'first_party': 'first_party_encoded',
    'second_party': 'second_party_encoded',
}

# Collect the party names from both columns of both splits, so the encoder is
# fitted on every value it is asked to transform below.
all_labels = np.concatenate(
    [frame[source_col] for frame in (train, test) for source_col in encoded_column_for]
)

# Names that are also in the first/second-party overlap computed in the EDA cell.
intersecting_labels = list(set(all_labels) & set(intersection))

# A single encoder is shared by both columns, so a given name maps to the same
# integer whether it appears as first or second party.
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Attach the encoded columns to the training and test frames.
for frame in (train, test):
    for source_col, encoded_col in encoded_column_for.items():
        frame[encoded_col] = label_encoder.transform(frame[source_col])

In [28]:
# Shared TF-IDF vectorizer instance; it is passed to get_vector for both the training and test frames.
vectorizer = TfidfVectorizer()
def get_vector(vectorizer, df, train_mode):
    """Build a dense feature table from the three text columns of ``df``.

    The ``facts`` column is encoded first; when ``train_mode`` is true that
    call also fits the vectorizer. The two party-name columns are then encoded
    with the same (already fitted) vectorizer.

    Args:
        vectorizer: Object exposing ``fit_transform`` and ``transform`` that
            return sparse matrices (e.g. the notebook's TfidfVectorizer).
        df: DataFrame with ``first_party``, ``second_party`` and ``facts``
            columns.
        train_mode: If true, fit the vectorizer on ``df['facts']``; otherwise
            only transform with the existing fit.

    Returns:
        pandas.DataFrame holding the first-party, second-party and facts
        vectors concatenated column-wise, in that order.
    """
    encode_facts = vectorizer.fit_transform if train_mode else vectorizer.transform
    # Facts go first: in train mode this is the call that fits the vectorizer
    # the party columns depend on.
    facts_vec = encode_facts(df['facts'])
    party_vecs = [vectorizer.transform(df[col]) for col in ('first_party', 'second_party')]

    # Densify each piece and place them side by side.
    dense_parts = [part.toarray() for part in (*party_vecs, facts_vec)]
    return pd.DataFrame(np.hstack(dense_parts))

In [29]:
######################################
# Columns used for feature selection #
######################################

# NOTE: X_train and X_test set here are reassigned by the following cell,
# which replaces them with the get_vector output.
feature_columns = ['first_party_encoded', 'second_party_encoded', 'facts']

X_train = train[feature_columns]
X_test = test[feature_columns]
y_train = train['first_party_winner']

In [30]:
#######################################
# Features built from the text columns #
#######################################

Y_train = train["first_party_winner"]

# The training call must run first: with train_mode=True it fits the shared
# vectorizer, which the test call then reuses without refitting.
X_train = get_vector(vectorizer, train, train_mode=True)
X_test = get_vector(vectorizer, test, train_mode=False)

## Define Model & Train

In [31]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression

In [32]:
# Configuration
k = 10  # number of cross-validation folds

# Shuffled K-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=k, shuffle=True, random_state=1)

# Estimator under evaluation, with library-default settings.
model = LogisticRegression()

# Run the cross-validation: one score per fold.
scores = cross_val_score(estimator=model, X=X_train, y=Y_train, cv=kf)

# Report the mean of the per-fold scores.
print(f"Average cross-validation score: {scores.mean()}")


Average cross-validation score: 0.6529695050280788


In [33]:
# Fit a fresh model on the full training set; this instance is the one used
# for the test predictions.
model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

## Inference & Submission

In [34]:
# Load the sample submission; its first_party_winner column is overwritten with the predictions later.
submit = pd.read_csv('open/sample_submission.csv')
submit


Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
...,...,...
1235,TEST_1235,0
1236,TEST_1236,0
1237,TEST_1237,0
1238,TEST_1238,0


In [35]:
# Predict a label for every row of the test feature table with the fitted model.
pred = model.predict(X_test)

In [36]:
# Store the predictions in the submission frame and write it to disk.
submission_csv_path = Path('./submision/submission3.csv')
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists; otherwise a run from a fresh checkout fails here.
submission_csv_path.parent.mkdir(parents=True, exist_ok=True)

submit['first_party_winner'] = pred
submit.to_csv(submission_csv_path, index=False)
print('Done')

Done


# 디버그 존