Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

一个较大的模型在infer时出现 _mm256_storeu_ps(d + PACK_UNIT * 1, t1); #2755

Closed
postzhang123 opened this issue Feb 8, 2024 · 2 comments
Labels

Comments

@postzhang123
Copy link

平台(如果交叉编译请再附上交叉编译目标平台):

Platform(Include target platform as well if cross-compiling):

win10平台,64位和32位都出现该问题,
mnn的版本是2.7.2,
使用的模型比较大,大约是32MB的mnn文件,在进行推理的时候出现如下问题:
image
请问出现该问题是因为内存不足吗?

@postzhang123
Copy link
Author

补充,该函数的整体代码,void _AVX_MNNPackCUnit(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) {
// Repacks `src` into `dst` so that PACK_UNIT consecutive depth rows become
// interleaved per column. As the scalar tail loops below spell out, the mapping is
//   dst[z * areaOffset[1] * PACK_UNIT + PACK_UNIT * x + y]
//       = src[(z * PACK_UNIT + y) * areaOffset[0] + x]
// for depth group z, column x in [0, area) and lane y in [0, PACK_UNIT).
// Lanes that fall past `depth` in the last, partial group are written as 0.
//
// dst        - output buffer (written with unaligned 256-bit stores).
// src        - input buffer (read with unaligned 256-bit loads).
// area       - number of columns to convert per depth row.
// depth      - number of depth rows in src.
// areaOffset - areaOffset[0] is the src stride (in floats) between depth rows;
//              areaOffset[1] is the dst stride (in PACK_UNIT-float columns)
//              between depth groups.
//
// NOTE(review): every path below handles exactly eight rows/lanes with __m256
// registers, so this presumably requires PACK_UNIT == 8 - confirm against the
// macro definition, which is not part of this snippet.
// NOTE(review): the highest float written is at index
//   (ceil(depth / PACK_UNIT) - 1) * areaOffset[1] * PACK_UNIT + PACK_UNIT * area - 1,
// so a crash on one of the stores suggests checking that the caller's dst
// allocation and areaOffset[1] cover that range.
// Number of complete PACK_UNIT-wide column blocks / depth groups.
auto areaC4 = area / PACK_UNIT;
auto depthC4 = depth / PACK_UNIT;
auto srcAreaOffset = areaOffset[0];
auto dstAreaOffset = areaOffset[1];
// t0..t7 are not assigned anywhere in the visible code; presumably the
// TRANSPOSE_8x8 macro (defined elsewhere) fills them from r0..r7 - confirm.
__m256 t0, t1, t2, t3, t4, t5, t6, t7;
// Main body: full depth groups x full column blocks, one 8x8 tile at a time.
for (int z = 0; z < depthC4; ++z) {
auto dstPlane = dst + z * dstAreaOffset * PACK_UNIT;
auto srcPlane = src + z * srcAreaOffset * PACK_UNIT;
for (int x = 0; x < areaC4; ++x) {
auto s = srcPlane + PACK_UNIT * x;
auto d = dstPlane + PACK_UNIT * PACK_UNIT * x;
// Load eight floats from each of the eight depth rows of this group.
auto r0 = _mm256_loadu_ps(s + 0 * srcAreaOffset);
auto r1 = _mm256_loadu_ps(s + 1 * srcAreaOffset);
auto r2 = _mm256_loadu_ps(s + 2 * srcAreaOffset);
auto r3 = _mm256_loadu_ps(s + 3 * srcAreaOffset);
auto r4 = _mm256_loadu_ps(s + 4 * srcAreaOffset);
auto r5 = _mm256_loadu_ps(s + 5 * srcAreaOffset);
auto r6 = _mm256_loadu_ps(s + 6 * srcAreaOffset);
auto r7 = _mm256_loadu_ps(s + 7 * srcAreaOffset);

        TRANSPOSE_8x8;

        // Write the eight result vectors to eight consecutive PACK_UNIT-float slots.
        _mm256_storeu_ps(d + PACK_UNIT * 0, t0);
        _mm256_storeu_ps(d + PACK_UNIT * 1, t1);
        _mm256_storeu_ps(d + PACK_UNIT * 2, t2);
        _mm256_storeu_ps(d + PACK_UNIT * 3, t3);
        _mm256_storeu_ps(d + PACK_UNIT * 4, t4);
        _mm256_storeu_ps(d + PACK_UNIT * 5, t5);
        _mm256_storeu_ps(d + PACK_UNIT * 6, t6);
        _mm256_storeu_ps(d + PACK_UNIT * 7, t7);
    }
}
// Despite the names, these are the counts already covered by the main body:
// the first column / depth row that still needs handling.
auto areaRemain  = areaC4 * PACK_UNIT;
auto depthRemain = depthC4 * PACK_UNIT;
// Down
// Leftover depth rows (depth % PACK_UNIT of them) that form a partial group.
int remain = depth - depthRemain;
if (remain > 0) {
    float* dstPlane       = depthC4 * dstAreaOffset * PACK_UNIT + dst;
    const float* srcPlane = src + depthC4 * srcAreaOffset * PACK_UNIT;
    {
        // Full column blocks of the partial group: rows that do not exist stay zero.
        for (int x = 0; x < areaC4; ++x) {
            auto s  = srcPlane + PACK_UNIT * x;
            auto d  = dstPlane + PACK_UNIT * PACK_UNIT * x;
            // Row 0 always exists here because remain > 0.
            auto r0 = _mm256_loadu_ps(s + 0 * srcAreaOffset);
            auto r1 = _mm256_setzero_ps();
            auto r2 = _mm256_setzero_ps();
            auto r3 = _mm256_setzero_ps();
            auto r4 = _mm256_setzero_ps();
            auto r5 = _mm256_setzero_ps();
            auto r6 = _mm256_setzero_ps();
            auto r7 = _mm256_setzero_ps();
            // The cases have no break on purpose: entering at `remain` falls
            // through and loads rows remain-1 down to 1. r7 is never loaded
            // because remain is at most PACK_UNIT - 1.
            switch (remain) {
                case 7:
                    r6 = _mm256_loadu_ps(s + 6 * srcAreaOffset);
                case 6:
                    r5 = _mm256_loadu_ps(s + 5 * srcAreaOffset);
                case 5:
                    r4 = _mm256_loadu_ps(s + 4 * srcAreaOffset);
                case 4:
                    r3 = _mm256_loadu_ps(s + 3 * srcAreaOffset);
                case 3:
                    r2 = _mm256_loadu_ps(s + 2 * srcAreaOffset);
                case 2:
                    r1 = _mm256_loadu_ps(s + 1 * srcAreaOffset);
                default:
                    break;
            }

            TRANSPOSE_8x8;

            // Same eight slots as the main body, written in descending order.
            _mm256_storeu_ps(d + PACK_UNIT * 7, t7);
            _mm256_storeu_ps(d + PACK_UNIT * 6, t6);
            _mm256_storeu_ps(d + PACK_UNIT * 5, t5);
            _mm256_storeu_ps(d + PACK_UNIT * 4, t4);
            _mm256_storeu_ps(d + PACK_UNIT * 3, t3);
            _mm256_storeu_ps(d + PACK_UNIT * 2, t2);
            _mm256_storeu_ps(d + PACK_UNIT * 1, t1);
            _mm256_storeu_ps(d + PACK_UNIT * 0, t0);
        }
    }
    // Bottom-right corner: leftover columns of the partial group, done per element.
    for (int x = areaRemain; x < area; ++x) {
        for (int y = 0; y < remain; y++) {
            dstPlane[PACK_UNIT * x + y] = srcPlane[y * srcAreaOffset + x];
        }
        // Zero the lanes that have no source row.
        for (int y = remain; y < PACK_UNIT; y++) {
            dstPlane[PACK_UNIT * x + y] = 0;
        }
    }
}
// Right
// Leftover columns (area % PACK_UNIT of them) of every full depth group.
for (int z = 0; z < depthC4; ++z) {
    float* dstPlane       = z * dstAreaOffset * PACK_UNIT + dst;
    const float* srcPlane = src + z * srcAreaOffset * PACK_UNIT;
    for (int x = areaRemain; x < area; ++x) {
        // Gather column x from the eight rows of this group.
        float s0 = srcPlane[x];
        float s1 = srcPlane[x + srcAreaOffset];
        float s2 = srcPlane[x + srcAreaOffset * 2];
        float s3 = srcPlane[x + srcAreaOffset * 3];
        float s4 = srcPlane[x + srcAreaOffset * 4];
        float s5 = srcPlane[x + srcAreaOffset * 5];
        float s6 = srcPlane[x + srcAreaOffset * 6];
        float s7 = srcPlane[x + srcAreaOffset * 7];
        // _mm256_set_ps takes its arguments highest element first, so s0 lands in lane 0.
        _mm256_storeu_ps(dstPlane + PACK_UNIT * x, _mm256_set_ps(s7, s6, s5, s4, s3, s2, s1, s0));
    }
}

}

@postzhang123 postzhang123 changed the title 一个较大的模型在infer时开在 _mm256_storeu_ps(d + PACK_UNIT * 1, t1); 一个较大的模型在infer时出现 _mm256_storeu_ps(d + PACK_UNIT * 1, t1); Feb 8, 2024
Copy link

github-actions bot commented Apr 8, 2024

Marking as stale. No activity in 60 days.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

1 participant