<a href="https://colab.research.google.com/github/ZHUTING0522/line_broadcast_msg/blob/main/Chiikawa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-learn matplotlib mecab-python3

In [None]:
# Install the MeCab system packages with apt (-y answers the prompts):
# mecab itself, its development library, and the UTF-8 ipadic dictionary.
!apt-get install -y mecab libmecab-dev mecab-ipadic-utf8

In [3]:
import pandas as pd
import re
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [None]:
# Colab-only cell: calls google.colab's files.upload() (presumably the browser
# upload dialog) and keeps whatever it returns in `uploaded`.
# NOTE(review): main() reads 'chiikawa_posts (1).csv' from the working
# directory, so the uploaded file presumably has to carry that name — confirm.
from google.colab import files
uploaded = files.upload()

In [13]:
_MECAB_TAGGER = None  # Shared MeCab tagger, created lazily by _get_mecab_tagger().


def _get_mecab_tagger():
    """Return the shared `-Owakati` MeCab tagger, creating it on first use."""
    global _MECAB_TAGGER
    if _MECAB_TAGGER is None:
        _MECAB_TAGGER = MeCab.Tagger("-Owakati")
    return _MECAB_TAGGER


def preprocess_text(text):
    """Clean a piece of Japanese text and tokenize it with MeCab.

    Steps:
      1. Remove URLs (anything starting with ``http`` or ``www`` up to the
         next whitespace).
      2. Replace every run of digits and non-word characters (punctuation,
         symbols, whitespace) with a single space. ``\\W`` is Unicode-aware
         for ``str`` input, so kana/kanji and Latin letters are kept.
      3. Tokenize the remainder with MeCab in ``-Owakati`` mode.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the ``NaN``
            pandas produces for empty CSV cells) are treated as empty text.

    Returns:
        str: MeCab's ``-Owakati`` output with surrounding whitespace
        stripped, or ``""`` when there is nothing left to tokenize.
    """
    # Missing values arrive as float NaN / None; re.sub would raise TypeError.
    if not isinstance(text, str):
        return ""

    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[\d\W]+', ' ', text)  # Digits and non-word chars -> one space

    # Nothing but separators left: skip the tokenizer entirely.
    if not text.strip():
        return ""

    # Reuse one tagger so it is built once instead of once per call.
    return _get_mecab_tagger().parse(text).strip()

# Load data (CSV should have columns: 'content' and 'sentiment')
def load_and_preprocess_data(filename):
    """Read the posts CSV and add a tokenized 'cleaned_content' column.

    Args:
        filename: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV file containing a 'content' column.

    Returns:
        pandas.DataFrame: The loaded frame with an additional
        'cleaned_content' column holding ``preprocess_text`` applied to each
        'content' value. The 'content' column itself is left untouched.

    Raises:
        KeyError: If the CSV has no 'content' column.
    """
    df = pd.read_csv(filename)
    # Empty CSV cells are read as NaN (a float) rather than str; normalise
    # every value to str so preprocess_text always receives text.
    content = df['content'].fillna('').astype(str)
    df['cleaned_content'] = content.apply(preprocess_text)
    return df

In [14]:
def extract_features(data, max_features=5000, token_pattern=r"(?u)\b\w+\b"):
    """Build a TF-IDF matrix from the 'cleaned_content' column.

    Args:
        data: Mapping/DataFrame whose 'cleaned_content' entry holds
            whitespace-separated token strings.
        max_features: Upper bound on the vocabulary size passed to
            ``TfidfVectorizer``.
        token_pattern: Regex that defines a token. The default keeps
            single-character tokens, which scikit-learn's own default
            (``r"(?u)\\b\\w\\w+\\b"``, two or more word characters) drops,
            losing many one-character Japanese words after tokenization.
            Pass scikit-learn's default pattern to restore the old behavior.

    Returns:
        tuple: ``(X, vectorizer)`` where ``X`` is the result of
        ``fit_transform`` and ``vectorizer`` is the fitted
        ``TfidfVectorizer`` (needed later to transform new text).
    """
    vectorizer = TfidfVectorizer(max_features=max_features, token_pattern=token_pattern)
    # NOTE(review): the vectorizer is fitted on every row of `data`; if the
    # caller splits into train/test afterwards, IDF weights include test rows.
    X = vectorizer.fit_transform(data['cleaned_content'])
    return X, vectorizer

In [15]:
def train_and_evaluate_model(X, y, test_size=0.2, random_state=42, max_iter=1000):
    """Fit a logistic-regression classifier and print its held-out report.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and
            ``LogisticRegression.fit``.
        y: Target labels aligned with the rows of ``X``.
        test_size: Share of the data held out for evaluation
            (forwarded to ``train_test_split``).
        random_state: Seed used for both the split and the classifier so
            repeated runs give the same result.
        max_iter: Iteration limit forwarded to ``LogisticRegression``.

    Returns:
        LogisticRegression: The model fitted on the training portion.

    Side effects:
        Prints the ``classification_report`` for the held-out portion.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = LogisticRegression(max_iter=max_iter, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    return model

In [16]:
def predict_sentiment(text, model, vectorizer):
    """Return the predicted label for a single raw post.

    The text is run through ``preprocess_text``, wrapped in a one-element
    list for ``vectorizer.transform``, and the first entry of
    ``model.predict``'s result is returned.

    Args:
        text: Raw post text.
        model: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
    """
    features = vectorizer.transform([preprocess_text(text)])
    predictions = model.predict(features)
    return predictions[0]

In [17]:
def visualize_sentiment_distribution(data):
    """Show a pie chart of how often each 'sentiment' value occurs.

    Args:
        data: DataFrame (or mapping of Series) with a 'sentiment' column.

    The chart is displayed with ``plt.show()``; nothing is returned.
    """
    palette = ['#66c2a5', '#fc8d62', '#8da0cb']
    counts = data['sentiment'].value_counts()

    # Explicit figure/axes pair instead of the implicit pyplot state.
    _, ax = plt.subplots(figsize=(8, 6))
    counts.plot.pie(ax=ax, autopct='%1.1f%%', startangle=140, colors=palette)
    ax.set_title('Sentiment Distribution')
    ax.set_ylabel('')  # Drop the Series name pandas would put on the y axis.
    plt.show()

In [None]:
def main(filename='chiikawa_posts (1).csv'):
    """Run the whole sentiment pipeline on one CSV file.

    Loads and tokenizes the posts, builds TF-IDF features, trains and
    evaluates the classifier, plots the label distribution, and prints the
    prediction for one example post.

    Args:
        filename: Path of the CSV to analyze; it needs 'content' and
            'sentiment' columns. Defaults to the file name this notebook
            was originally run with.

    Returns:
        pandas.DataFrame: The loaded data including 'cleaned_content'.
    """
    # Load and preprocess data (shared helper instead of an inline copy)
    df = load_and_preprocess_data(filename)

    # Feature extraction
    X, vectorizer = extract_features(df)
    y = df['sentiment']

    # Train and evaluate model
    model = train_and_evaluate_model(X, y)

    # Visualize sentiment distribution
    visualize_sentiment_distribution(df)

    # Example prediction
    new_post = "ちいかわの新しいグッズは素敵すぎる！"
    sentiment = predict_sentiment(new_post, model, vectorizer)
    print(f"Predicted sentiment for the post: {sentiment}")
    return df

# Run the full pipeline when this code executes as the main module.
if __name__ == "__main__":
    main()