In [18]:
import pandas as pd
# Raise pandas' display limits (columns, rows, line width) to 1000 each.
for display_option in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(display_option, 1000)

# Read the SPSS (.sav) survey file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact location exists.
df = pd.read_spss(
    '/Users/apple/Documents/GitHub/TJ-sem-project/data0612_family.sav'
)


In [None]:
# Preview the first rows of the loaded data.
# (A bare `df` expression used to sit above this line; a non-final expression in
# a cell is evaluated and discarded, so it displayed nothing and was removed.)
print(df.head())

# data cleaning

In [None]:
# Rename variables: map the original Chinese survey column names to short
# English names used in the rest of the notebook.
column_renames = {
    '核心家庭': 'coresidence',  # nuclear family
    '年收入': 'income',         # annual income
    '最高学历': 'edu',          # highest education level
    '年龄': 'age',              # age
}
df = df.rename(columns=column_renames)
print(df.head())



## coresidence

In [None]:
# Recode coresidence (核心家庭)
# Print the frequency of each raw coresidence value before recoding.
coresidence_counts = df['coresidence'].value_counts()
print("coresidence_counts:\n", coresidence_counts)

def recode(coresidence):
    """Turn a raw coresidence value into a 0/1 indicator.

    Args:
        coresidence: A single raw value from the coresidence column.

    Returns:
        1 if the value equals the string '核心家庭', otherwise 0. Any other
        value, including a missing one, yields 0.
    """
    return 1 if coresidence == '核心家庭' else 0
# Apply recode to every coresidence value to build the 0/1 indicator column,
# then preview the first rows as the cell output.
df['coresidence_recode'] = df['coresidence'].apply(recode)
df.head()



## education

In [None]:
# Recode education
# Print the frequency of each raw education value before recoding.
edu_counts = df['edu'].value_counts()
print("edu_counts:\n", edu_counts)

# One entry per collapsed level: (numeric code, English label, raw survey values).
edu_levels = [
    (1, 'middle school and below', ['小学及以下', '初中']),
    (2, 'high school', ['高中']),
    (3, 'associate', ['大专']),
    (4, 'bachelor and above', ['本科', '硕士', '博士']),
]

# raw survey value -> numeric code
edu_recode_dict = {raw: code for code, _, raws in edu_levels for raw in raws}
# numeric code -> English label
edu_label_dict = {code: label for code, label, _ in edu_levels}

# Raw values that are not keys of edu_recode_dict come out as NaN in both columns.
df['edu_recode'] = df['edu'].map(edu_recode_dict)
df['edu_label'] = df['edu_recode'].map(edu_label_dict)

print(df)


## age

In [None]:
# Recode age
# Report the observed age range before binning.
min_age, max_age = df['age'].min(), df['age'].max()

print(f"Minimum age: {min_age}", f"Maximum age: {max_age}", sep="\n")

# Install matplotlib package
%pip install matplotlib
%pip install seaborn

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's whitegrid theme for the plot
sns.set_theme(style="whitegrid")

# Histogram of the 'age' variable (15 bins) with a KDE curve, drawn on an
# explicitly created 8x6 figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['age'], bins=15, kde=True, color='skyblue', ax=ax)

# Title and axis labels
ax.set_title('Distribution of Age', fontsize=16)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

# Render the figure
plt.show()

# Number of respondents aged 40 or older
count_40above = df['age'].ge(40).sum()
print(count_40above)

# Number of respondents aged 20 up to (not including) 25
count_20to25 = (df['age'].ge(20) & df['age'].lt(25)).sum()
print(count_20to25)

# Number of respondents aged 25 up to (not including) 30
count_25to30 = (df['age'].ge(25) & df['age'].lt(30)).sum()
print(count_25to30)

# age group
def categorize_age(age):
    """Assign an age to a five-year band.

    Bands are half-open so that fractional ages cannot fall between them:
    [20, 25) -> 1, [25, 30) -> 2, [30, 35) -> 3, [35, 40) -> 4, 40 and up -> 5.

    Args:
        age: Numeric age; NaN is accepted.

    Returns:
        A ``(code, label)`` tuple. Ages below 20 and NaN return
        ``(None, None)`` instead of being counted in the oldest band.
    """
    if 20 <= age < 25:
        return 1, "age 20 to 24"
    if 25 <= age < 30:
        return 2, "age 25 to 29"
    if 30 <= age < 35:
        return 3, "age 30 to 34"
    if 35 <= age < 40:
        # Label corrected: this band ends at 39, not 40.
        return 4, "age 35 to 39"
    if age >= 40:
        # Label reflects that 40 itself belongs to this band.
        return 5, "age 40 and above"
    # Under 20, or NaN (every comparison with NaN is False): no band applies.
    return None, None

# Store the numeric band code and its text label in two separate columns.
# Both targets used to be named 'age_group', so the code and the label could not
# both be kept. The label now goes to 'age_group_label'.
age_bands = pd.DataFrame(
    [categorize_age(age) for age in df['age']],
    columns=['age_group', 'age_group_label'],
    index=df.index,  # keep row alignment with df
)
df[['age_group', 'age_group_label']] = age_bands

print(df)


## Income

In [None]:
# Print the frequency of each raw income bracket before converting the column.
income_counts = df['income'].value_counts()
print("income_counts:\n", income_counts)

# Convert 'income' column to an ordered categorical type, lowest bracket first,
# so that min()/max() follow bracket order rather than string order.
# Values not listed in income_order become NaN in the converted column.
income_order = ['1万以下', '1-3.9万', '4-6.9万', '7-9.9万', '10-19万', '20-49万', '50-99万', '100-199万', '200万以上']
df['income'] = pd.Categorical(df['income'], categories=income_order, ordered=True)

# Fixed: a stray "31" after df['income'] made this line a syntax error.
min_income = df['income'].min()
max_income = df['income'].max()

print(f"Minimum income: {min_income}")
print(f"Maximum income: {max_income}")


income_counts:
 income
10-19万      297
7-9.9万      234
4-6.9万      164
20-49万      116
1-3.9万       99
1万以下         34
50-99万       23
100-199万     10
200万以上        1
Name: count, dtype: int64
Minimum income: 1万以下
Maximum income: 200万以上


In [None]:
def clean_data(df):
    """Cast the recoded grouping columns to the pandas ``category`` dtype.

    Args:
        df: DataFrame containing the columns 'edu_recode', 'age_group' and
            'coresidence_recode'.

    Returns:
        The DataFrame produced by ``astype`` with those three columns as
        ``category``; all other columns keep their dtype.
    """
    categorical_columns = ('edu_recode', 'age_group', 'coresidence_recode')
    return df.astype({column: 'category' for column in categorical_columns})

# clean_data is given a copy of df; the converted frame is stored as df_clean
# and its first rows are shown as the cell output.
df_clean = clean_data(df.copy())
df_clean.head()

# SEM modeling

In [None]:
# Install semopy package
%pip install semopy

# SEM modeling test:
#   original model: filial piety, satisfaction, anxiety
from semopy import Model

# Measurement model: each latent factor is defined (=~) by its survey items.
# The description used to end with a dangling "coresidence =~" line that listed
# no indicators; it is removed because a measurement definition needs at least
# one indicator, and 'coresidence' is already an observed column in df.
desc = """
    filial =~ XS1 + XS2 + XS3 + XS4 + XS5 + XS6 + XS7 + XS8
    satisfaction =~ JT1 + JT2 + JT3 + JT4 + JT5
    anxiety =~ JL1 + JL2 + JL3 +JL4 +JL5 +JL6 +JL7 +JL8 + JL9
"""
model = Model(desc)
opt_res = model.fit(df)       # fit result is kept but not inspected here
estimates = model.inspect()   # table of parameter estimates

print(estimates)

# NOTE(review): the figures below were written by hand, presumably from an
# earlier run. The description above contains no structural regressions, so
# confirm which rows of `estimates` they refer to.
#filial ~ satisfaction: 0.243 (p < 0.001)
#filial ~ anxiety: 0.443 (p < 0.001) moderate relationship.
#satisfaction ~ anxiety: -0.006 (p = 0.775)