In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
train = pd.read_csv('./open/train.csv')
test = pd.read_csv('./open//test.csv')

## EDA

In [4]:
train

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [5]:
test

Unnamed: 0,ID,first_party,second_party,facts
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."
...,...,...,...,...
1235,TEST_1235,"Haitian Centers Council, Inc., et al.","Chris Sale, Acting Commissioner, Immigration A...",According to Executive Order No. 12807 signed ...
1236,TEST_1236,Whitman,"American Trucking Associations, Inc.",Section 109(a) of the Clean Air Act (CAA) requ...
1237,TEST_1237,Linda A. Matteo and John J. Madigan,William G. Barr,Linda Matteo and John Madigan created a plan f...
1238,TEST_1238,Washington State Apple Advertising Commission,Hunt,"In 1972, the North Carolina Board of Agricultu..."


In [6]:
# 라벨인코딩
first_party_labels = train['first_party'].unique()
second_party_labels = train['second_party'].unique()

print("겹치는 라벨이 있는지 확인:")
print("first_party와 second_party의 라벨 개수:", len(first_party_labels), len(second_party_labels))

intersection = set(first_party_labels) & set(second_party_labels)
if len(intersection) > 0:
    print("겹치는 라벨이 있습니다:", intersection)
    print("겹치는 라벨의 갯수는:", len(intersection))
else:
    print("겹치는 라벨이 없습니다.")

겹치는 라벨이 있는지 확인:
first_party와 second_party의 라벨 개수: 2110 1974
겹치는 라벨이 있습니다: {'Rodriguez', 'United States of America', 'Thomas', 'Wilson', 'Jenkins', 'United States, et al.', 'Jeffrey A. Beard, Secretary, Pennsylvania Department of Corrections', 'Boggs', 'Ricky Bell, Warden', 'Byrd', 'Doug Dretke, Director, Texas Department of Criminal Justice, Correctional Institutions Division', 'Ken L. Salazar, Secretary of the Interior, et al.', 'Eric H. Holder, Jr., Attorney General, et al.', 'Garcia', 'Hunt', 'Tennessee Valley Authority', 'Pacific Gas & Electric Company', 'Immigration and Naturalization Service', 'Golden Bethune-Hill, et al.', 'College Savings Bank', 'Washington', 'Shalala', 'Wood', 'Republic of Argentina', 'Lee', 'New York', 'AT&T Corporation', 'Gregory', 'City and County of San Francisco, California, et al.', 'Withrow', 'Wright', 'Murphy', 'Hustler Magazine, Inc.', 'Montana', 'Tara Sheneva Williams', 'Miller', 'International Society for Krishna Consciousness, Inc.', 'Nevada', 'Mor

In [7]:
intersection_rows = train[train['first_party'].isin(intersection) | train['second_party'].isin(intersection)]

print("겹치는 이름이 포함된 행:")
intersection_rows

겹치는 이름이 포함된 행:


Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
17,TRAIN_0017,Steven Spears,United States,"In 2004, Steven Spears was charged and convict...",1
18,TRAIN_0018,United States,Martinez-Salazar,Abel Martinez-Salazar was charged with a varie...,1
20,TRAIN_0020,Clay,United States,"Board No. 47, Louisville, Kentucky, denied the...",1
...,...,...,...,...,...
2466,TRAIN_2466,Monroe Ace Setser,United States,"On October 1, 2007, Lubbock police officers ar...",0
2471,TRAIN_2471,United States,James X. Bormes,"In October 2000, the United States Treasury De...",1
2472,TRAIN_2472,United States,Cuauhtemoc Gonzalez-Lopez,"Cuauhtemoc Gonzalez-Lopez hired Joseph Low, an...",0
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0


In [8]:
df = intersection_rows
df.to_csv('./submision/submission2.csv', index=False)
print('Done')

Done


In [9]:
# len_sentance 추가
train['len_sentance'] = df['facts'].apply(lambda x: len(x.split('.')))

# len_word 추가
train['len_word'] = df['facts'].apply(lambda x: len(x.split()))

# len_alpa 추가
train['len_alpa'] = df['facts'].apply(lambda x: len(''.join(filter(str.isalpha, x))))

In [10]:
train

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner,len_sentance,len_word,len_alpa
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1,,,
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0,,,
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1,,,
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0,5.0,55.0,296.0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1,11.0,177.0,860.0
...,...,...,...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1,,,
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1,,,
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0,8.0,171.0,780.0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0,13.0,167.0,862.0


## Data Preprocessing

In [None]:
##############
# 라벨인코딩    #
##############
from sklearn.preprocessing import LabelEncoder

# 겹치는 이름을 고려하여 라벨 인코딩을 수행합니다.
label_encoder = LabelEncoder()

# first_party와 second_party 열을 합쳐서 모든 라벨을 추출합니다.
all_labels = np.concatenate([train['first_party'], train['second_party'], test['first_party'], test['second_party']])

# 겹치는 라벨만 추출합니다.
intersecting_labels = list(set(all_labels) & set(intersection))

# 겹치는 라벨을 고려하여 전체 데이터에 대해 라벨 인코딩을 수행합니다.
label_encoder.fit(all_labels)

# 'first_party' 열의 라벨 인코딩
train['first_party_encoded'] = label_encoder.transform(train['first_party'])
test['first_party_encoded'] = label_encoder.transform(test['first_party'])

# 'second_party' 열의 라벨 인코딩
train['second_party_encoded'] = label_encoder.transform(train['second_party'])
test['second_party_encoded'] = label_encoder.transform(test['second_party'])

In [None]:
#################################
# Feature selection에 사용할 특성들 #
#################################

# Feature selection에 사용할 특성들
X_train = train[['first_party_encoded', 'second_party_encoded', 'facts']]
y_train = train['first_party_winner']
X_test = test[['first_party_encoded', 'second_party_encoded', 'facts']]

In [None]:
#########################
# facts 를 TF-IDF로 인코딩 #
#########################
vectorizer = TfidfVectorizer()
def get_vector(vectorizer, df, train_mode):
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])
    
    X = np.concatenate([X_party1.todense(), X_party2.todense(), X_facts.todense()], axis=1)
    return pd.DataFrame(X)

In [None]:
X_train = get_vector(vectorizer, train, True)
Y_train = train["first_party_winner"]
X_test = get_vector(vectorizer, test, False)

## Define Model & Train

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression

In [None]:
# 설정
k = 10  # 원하는 fold의 수를 설정합니다.

# 모델 설정
model = LogisticRegression()

# KFold 교차 검증 객체를 생성합니다.
kf = KFold(n_splits=k, shuffle=True, random_state=1)

# 교차 검증을 수행합니다.
scores = cross_val_score(model, X_train, Y_train, cv=kf)

# 교차 검증 점수의 평균을 출력합니다.
print(f"Average cross-validation score: {scores.mean()}")


In [None]:
model = LogisticRegression()
model.fit(X_train, Y_train)

## Inference & Submission

In [None]:
submit = pd.read_csv('open/sample_submission.csv')
submit


In [None]:
pred = model.predict(X_test)

In [None]:
submit['first_party_winner'] = pred
submit.to_csv('./submision/submission3.csv', index=False)
print('Done')

# 디버그 존